From 8798ea68b0a34e7ecae43f4ccaf7a446697c51c8 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 10 Oct 2018 13:39:59 +0000 Subject: [PATCH 0001/1116] [x86] allow single source horizontal op matching (PR39195) This is intended to restore horizontal codegen to what it looked like before IR demanded elements improved in: rL343727 As noted in PR39195: https://bugs.llvm.org/show_bug.cgi?id=39195 ...horizontal ops can be worse for performance than a shuffle+regular binop, so I've added a TODO. Ideally, we'd solve that in a machine instruction pass, but a quicker solution will be adding a 'HasFastHorizontalOp' feature bit to deal with it here in the DAG. Differential Revision: https://reviews.llvm.org/D52997 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344141 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 8 +- .../X86/avx512-intrinsics-fast-isel.ll | 12 +-- test/CodeGen/X86/haddsub-undef.ll | 71 ++++---------- test/CodeGen/X86/phaddsub.ll | 96 +++++-------------- test/CodeGen/X86/vector-shuffle-combining.ll | 39 +++++--- 5 files changed, 79 insertions(+), 147 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4c18c5a84c2..67f98d8ee72 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -37026,9 +37026,13 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { continue; // The low half of the 128-bit result must choose from A. - // The high half of the 128-bit result must choose from B. + // The high half of the 128-bit result must choose from B, + // unless B is undef. In that case, we are always choosing from A. + // TODO: Using a horizontal op on a single input is likely worse for + // performance on many CPUs, so this should be limited here or reversed + // in a later pass. unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; - unsigned Src = i >= NumEltsPer64BitChunk; + unsigned Src = B.getNode() ? 
i >= NumEltsPer64BitChunk : 0; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index f889bb90550..20c509732c8 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -7210,8 +7210,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) { ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -7226,8 +7225,7 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) { ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -7407,8 +7405,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -7425,8 +7422,7 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; X64-NEXT: 
vzeroupper ; X64-NEXT: retq entry: diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll index 84decabcbce..d7c0936a474 100644 --- a/test/CodeGen/X86/haddsub-undef.ll +++ b/test/CodeGen/X86/haddsub-undef.ll @@ -453,14 +453,12 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) { define <2 x double> @add_pd_003(<2 x double> %x) { ; SSE-LABEL: add_pd_003: ; SSE: # %bb.0: -; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] -; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: haddpd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_pd_003: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x @@ -472,16 +470,12 @@ define <2 x double> @add_pd_003(<2 x double> %x) { define <2 x double> @add_pd_003_2(<2 x double> %x) { ; SSE-LABEL: add_pd_003_2: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: haddpd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_pd_003_2: ; AVX: # %bb.0: -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x @@ -491,16 +485,12 @@ define <2 x double> @add_pd_003_2(<2 x double> %x) { define <2 x double> @add_pd_010(<2 x double> %x) { ; SSE-LABEL: add_pd_010: ; SSE: # %bb.0: -; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] -; SSE-NEXT: addpd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: haddpd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_pd_010: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; 
AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> @@ -512,17 +502,12 @@ define <2 x double> @add_pd_010(<2 x double> %x) { define <4 x float> @add_ps_007(<4 x float> %x) { ; SSE-LABEL: add_ps_007: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_007: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> @@ -533,18 +518,13 @@ define <4 x float> @add_ps_007(<4 x float> %x) { define <4 x float> @add_ps_030(<4 x float> %x) { ; SSE-LABEL: add_ps_030: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm0, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_030: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> @@ -557,16 +537,12 @@ define <4 x float> @add_ps_030(<4 x float> %x) { define <4 x float> @add_ps_007_2(<4 x float> %x) { ; SSE-LABEL: add_ps_007_2: ; SSE: # %bb.0: -; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: 
addps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_007_2: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> @@ -577,14 +553,12 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) { define <4 x float> @add_ps_008(<4 x float> %x) { ; SSE-LABEL: add_ps_008: ; SSE: # %bb.0: -; SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_008: ; AVX: # %bb.0: -; AVX-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %x @@ -594,16 +568,13 @@ define <4 x float> @add_ps_008(<4 x float> %x) { define <4 x float> @add_ps_017(<4 x float> %x) { ; SSE-LABEL: add_ps_017: ; SSE: # %bb.0: -; SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE-NEXT: addps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm0, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_017: ; AVX: # %bb.0: -; AVX-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> @@ -615,17 +586,13 @@ define <4 x float> @add_ps_017(<4 x float> %x) { define <4 x float> @add_ps_018(<4 x float> %x) { ; SSE-LABEL: add_ps_018: ; SSE: # %bb.0: -; SSE-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] -; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: haddps %xmm0, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: add_ps_018: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll index 5d7c77b9a81..7b3f8db76c4 100644 --- a/test/CodeGen/X86/phaddsub.ll +++ b/test/CodeGen/X86/phaddsub.ll @@ -286,16 +286,12 @@ define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @phaddd_single_source1(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source1: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source1: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> @@ -306,17 +302,13 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) { define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source2: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source2: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[0,1,0,2] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> @@ -329,16 +321,12 @@ define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { define <4 x i32> @phaddd_single_source3(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source3: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source3: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> @@ -349,14 +337,12 @@ define <4 x i32> @phaddd_single_source3(<4 x i32> %x) { define <4 x i32> @phaddd_single_source4(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source4: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %x @@ -366,15 +352,13 @@ define <4 x i32> @phaddd_single_source4(<4 x i32> %x) { define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source5: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; SSSE3-NEXT: paddd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] +; 
SSSE3-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source5: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> @@ -386,17 +370,13 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { define <4 x i32> @phaddd_single_source6(<4 x i32> %x) { ; SSSE3-LABEL: phaddd_single_source6: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSSE3-NEXT: paddd %xmm1, %xmm0 +; SSSE3-NEXT: phaddd %xmm0, %xmm0 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddd_single_source6: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> @@ -409,17 +389,12 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) { define <8 x i16> @phaddw_single_source1(<8 x i16> %x) { ; SSSE3-LABEL: phaddw_single_source1: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15] -; SSSE3-NEXT: paddw %xmm1, %xmm0 +; SSSE3-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddw_single_source1: ; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15] -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; 
AVX-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> @@ -430,22 +405,14 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) { define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { ; SSSE3-LABEL: phaddw_single_source2: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSSE3-NEXT: paddw %xmm1, %xmm0 +; SSSE3-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddw_single_source2: ; AVX: # %bb.0: -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; AVX-NEXT: retq @@ -459,20 +426,12 @@ define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { define <8 x i16> @phaddw_single_source3(<8 x i16> %x) { ; SSSE3-LABEL: phaddw_single_source3: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSSE3-NEXT: paddw %xmm1, %xmm0 +; SSSE3-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddw_single_source3: ; AVX: # %bb.0: -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[0,1,0,3] -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> @@ -483,16 +442,12 @@ define <8 x i16> @phaddw_single_source3(<8 x i16> %x) { define <8 x i16> @phaddw_single_source4(<8 x i16> %x) { ; SSSE3-LABEL: phaddw_single_source4: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pslld $16, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddw_single_source4: ; AVX: # %bb.0: -; AVX-NEXT: vpslld $16, %xmm0, %xmm1 -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %x @@ -502,18 +457,13 @@ define <8 x i16> @phaddw_single_source4(<8 x i16> %x) { define <8 x i16> @phaddw_single_source6(<8 x i16> %x) { ; SSSE3-LABEL: phaddw_single_source6: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: paddw %xmm1, %xmm0 +; SSSE3-NEXT: phaddw %xmm0, %xmm0 ; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; AVX-LABEL: phaddw_single_source6: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index 
2eb9362947e..5c0a223d496 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2700,21 +2700,36 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { } define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { -; SSE-LABEL: PR22377: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] -; SSE-NEXT: addps %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: retq +; SSE2-LABEL: PR22377: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; SSE2-NEXT: addps %xmm0, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR22377: +; SSSE3: # %bb.0: # %entry +; SSSE3-NEXT: movaps %xmm0, %xmm1 +; SSSE3-NEXT: haddps %xmm0, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR22377: +; SSE41: # %bb.0: # %entry +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: haddps %xmm0, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: retq ; ; AVX-LABEL: PR22377: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq entry: %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> -- GitLab From df002d74e3e6588a8ffb2aa8459369696b2611c4 Mon Sep 17 
00:00:00 2001 From: Nirav Dave Date: Wed, 10 Oct 2018 14:15:52 +0000 Subject: [PATCH 0002/1116] [DAGCombine] Improve Load-Store Forwarding Summary: Extend analysis forwarding loads from preceding stores to work with extended loads and truncated stores to the same address so long as the load is fully subsumed by the store. Hexagon's swp-epilog-phis.ll and swp-memrefs-epilog1.ll tests are deleted as they no longer seem to be relevant. Reviewers: RKSimon, rnk, kparzysz, javed.absar Subscribers: sdardis, nemanjai, hiraditya, atanasyan, llvm-commits Differential Revision: https://reviews.llvm.org/D49200 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344142 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 145 ++++++++++++++++-- test/CodeGen/AArch64/arm64-ld-from-st.ll | 28 ++-- test/CodeGen/AArch64/regress-tblgen-chains.ll | 4 +- test/CodeGen/Hexagon/clr_set_toggle.ll | 2 +- test/CodeGen/Hexagon/swp-epilog-phis.ll | 55 ------- test/CodeGen/Hexagon/swp-memrefs-epilog1.ll | 90 ----------- test/CodeGen/Mips/cconv/vector.ll | 22 +-- .../Mips/indirect-jump-hazard/jumptables.ll | 22 ++- test/CodeGen/Mips/o32_cc_byval.ll | 8 +- test/CodeGen/Mips/o32_cc_vararg.ll | 10 +- test/CodeGen/PowerPC/addi-offset-fold.ll | 5 +- .../SystemZ/store_nonbytesized_vecs.ll | 3 +- test/CodeGen/X86/i386-shrink-wrapping.ll | 2 +- test/CodeGen/X86/pr32108.ll | 1 - test/CodeGen/X86/pr38533.ll | 6 - test/CodeGen/X86/win64_vararg.ll | 5 +- 16 files changed, 184 insertions(+), 224 deletions(-) delete mode 100644 test/CodeGen/Hexagon/swp-epilog-phis.ll delete mode 100644 test/CodeGen/Hexagon/swp-memrefs-epilog1.ll diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 29adcad22e1..eca5d8369eb 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -250,6 +250,11 @@ namespace { SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); + // 
Scalars have size 0 to distinguish from singleton vectors. + SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD); + bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val); + bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val); + /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed /// load. /// @@ -12762,6 +12767,133 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); } +static inline int numVectorEltsOrZero(EVT T) { + return T.isVector() ? T.getVectorNumElements() : 0; +} + +bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) { + Val = ST->getValue(); + EVT STType = Val.getValueType(); + EVT STMemType = ST->getMemoryVT(); + if (STType == STMemType) + return true; + if (isTypeLegal(STMemType)) + return false; // fail. + if (STType.isFloatingPoint() && STMemType.isFloatingPoint() && + TLI.isOperationLegal(ISD::FTRUNC, STMemType)) { + Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val); + return true; + } + if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) && + STType.isInteger() && STMemType.isInteger()) { + Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val); + return true; + } + if (STType.getSizeInBits() == STMemType.getSizeInBits()) { + Val = DAG.getBitcast(STMemType, Val); + return true; + } + return false; // fail. 
+} + +bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) { + EVT LDMemType = LD->getMemoryVT(); + EVT LDType = LD->getValueType(0); + assert(Val.getValueType() == LDMemType && + "Attempting to extend value of non-matching type"); + if (LDType == LDMemType) + return true; + if (LDMemType.isInteger() && LDType.isInteger()) { + switch (LD->getExtensionType()) { + case ISD::NON_EXTLOAD: + Val = DAG.getBitcast(LDType, Val); + return true; + case ISD::EXTLOAD: + Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val); + return true; + case ISD::SEXTLOAD: + Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val); + return true; + case ISD::ZEXTLOAD: + Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val); + return true; + } + } + return false; +} + +SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { + if (OptLevel == CodeGenOpt::None || LD->isVolatile()) + return SDValue(); + SDValue Chain = LD->getOperand(0); + StoreSDNode *ST = dyn_cast(Chain.getNode()); + if (!ST || ST->isVolatile()) + return SDValue(); + + EVT LDType = LD->getValueType(0); + EVT LDMemType = LD->getMemoryVT(); + EVT STMemType = ST->getMemoryVT(); + EVT STType = ST->getValue().getValueType(); + + BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG); + BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG); + int64_t Offset; + + bool STCoversLD = + BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset) && (Offset >= 0) && + (Offset * 8 <= LDMemType.getSizeInBits()) && + (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits()); + + if (!STCoversLD) + return SDValue(); + + // Memory as copy space (potentially masked). + if (Offset == 0 && LDType == STType && STMemType == LDMemType) { + // Simple case: Direct non-truncating forwarding + if (LDType.getSizeInBits() == LDMemType.getSizeInBits()) + return CombineTo(LD, ST->getValue(), Chain); + // Can we model the truncate and extension with an and mask? 
+ if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() && + !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) { + // Mask to size of LDMemType + auto Mask = + DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(), + STMemType.getSizeInBits()), + SDLoc(ST), STType); + auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask); + return CombineTo(LD, Val, Chain); + } + } + + // TODO: Deal with nonzero offset. + if (LD->getBasePtr().isUndef() || Offset != 0) + return SDValue(); + // Model necessary truncations / extensions. + SDValue Val; + // Truncate Value To Stored Memory Size. + do { + if (!getTruncatedStoreValue(ST, Val)) + continue; + if (!isTypeLegal(LDMemType)) + continue; + if (STMemType != LDMemType) { + if (numVectorEltsOrZero(STMemType) == numVectorEltsOrZero(LDMemType) && + STMemType.isInteger() && LDMemType.isInteger()) + Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val); + else + continue; + } + if (!extendLoadedValueToExtension(LD, Val)) + continue; + return CombineTo(LD, Val, Chain); + } while (false); + + // On failure, cleanup dead nodes we may have created. + if (Val->use_empty()) + deleteAndRecombine(Val.getNode()); + return SDValue(); +} + SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); @@ -12828,17 +12960,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) { // If this load is directly stored, replace the load value with the stored // value. - // TODO: Handle store large -> read small portion. 
- // TODO: Handle TRUNCSTORE/LOADEXT - if (OptLevel != CodeGenOpt::None && - ISD::isNormalLoad(N) && !LD->isVolatile()) { - if (ISD::isNON_TRUNCStore(Chain.getNode())) { - StoreSDNode *PrevST = cast(Chain); - if (PrevST->getBasePtr() == Ptr && - PrevST->getValue().getValueType() == N->getValueType(0)) - return CombineTo(N, PrevST->getOperand(1), Chain); - } - } + if (auto V = ForwardStoreValueToDirectLoad(LD)) + return V; // Try to infer better alignment information than the load already has. if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll index dd8add70cdb..5488c21fa29 100644 --- a/test/CodeGen/AArch64/arm64-ld-from-st.ll +++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll @@ -13,7 +13,7 @@ entry: } ; CHECK-LABEL: Str64Ldr32_0 -; CHECK: and x0, x1, #0xffffffff +; CHECK: mov w0, w1 define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { entry: %0 = bitcast i64* %P to i32* @@ -37,7 +37,7 @@ entry: } ; CHECK-LABEL: Str64Ldr16_0 -; CHECK: and x0, x1, #0xffff +; CHECK: mov w0, w1 define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { entry: %0 = bitcast i64* %P to i16* @@ -85,7 +85,7 @@ entry: } ; CHECK-LABEL: Str64Ldr8_0 -; CHECK: and x0, x1, #0xff +; CHECK: mov w0, w1 define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { entry: %0 = bitcast i64* %P to i8* @@ -193,7 +193,7 @@ entry: } ; CHECK-LABEL: Str32Ldr16_0 -; CHECK: and w0, w1, #0xffff +; CHECK: mov w0, w1 define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { entry: %0 = bitcast i32* %P to i16* @@ -217,7 +217,7 @@ entry: } ; CHECK-LABEL: Str32Ldr8_0 -; CHECK: and w0, w1, #0xff +; CHECK: mov w0, w1 define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { entry: %0 = bitcast i32* %P to i8* @@ -265,7 +265,7 @@ entry: } ; CHECK-LABEL: Str16Ldr16 -; CHECK: and w0, w1, #0xffff +; CHECK: mov w0, w1 define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { entry: %0 = bitcast i16* %P to 
i16* @@ -277,7 +277,7 @@ entry: } ; CHECK-LABEL: Str16Ldr8_0 -; CHECK: and w0, w1, #0xff +; CHECK: mov w0, w1 define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { entry: %0 = bitcast i16* %P to i8* @@ -314,7 +314,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str64Ldr32_0 -; CHECK: and x0, x1, #0xffffffff +; CHECK: mov w0, w1 define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) { entry: %0 = bitcast i64* %P to i32* @@ -338,7 +338,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str64Ldr16_0 -; CHECK: and x0, x1, #0xffff +; CHECK: mov w0, w1 define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) { entry: %0 = bitcast i64* %P to i16* @@ -386,7 +386,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str64Ldr8_0 -; CHECK: and x0, x1, #0xff +; CHECK: mov w0, w1 define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) { entry: %0 = bitcast i64* %P to i8* @@ -494,7 +494,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str32Ldr16_0 -; CHECK: and w0, w1, #0xffff +; CHECK: mov w0, w1 define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) { entry: %0 = bitcast i32* %P to i16* @@ -518,7 +518,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str32Ldr8_0 -; CHECK: and w0, w1, #0xff +; CHECK: mov w0, w1 define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) { entry: %0 = bitcast i32* %P to i8* @@ -566,7 +566,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str16Ldr16 -; CHECK: and w0, w1, #0xffff +; CHECK: mov w0, w1 define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) { entry: %0 = bitcast i16* %P to i16* @@ -578,7 +578,7 @@ entry: } ; CHECK-LABEL: Unscaled_Str16Ldr8_0 -; CHECK: and w0, w1, #0xff +; CHECK: mov w0, w1 define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) { entry: %0 = bitcast i16* %P to i8* diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll index 24038cda507..50da7d139f1 100644 --- a/test/CodeGen/AArch64/regress-tblgen-chains.ll +++ 
b/test/CodeGen/AArch64/regress-tblgen-chains.ll @@ -26,9 +26,9 @@ define i64 @test_chains() { store i8 %inc.4, i8* %locvar ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]] -; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1 +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1 ; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]] -; CHECK: and w0, w[[STRVAL]], #0xff +; CHECK: and x0, x[[STRVAL]], #0xff %ret.1 = load i8, i8* %locvar %ret.2 = zext i8 %ret.1 to i64 diff --git a/test/CodeGen/Hexagon/clr_set_toggle.ll b/test/CodeGen/Hexagon/clr_set_toggle.ll index 9318f2d8a6b..43c866c7b76 100644 --- a/test/CodeGen/Hexagon/clr_set_toggle.ll +++ b/test/CodeGen/Hexagon/clr_set_toggle.ll @@ -70,7 +70,7 @@ entry: define zeroext i16 @my_setbit(i16 zeroext %crc) nounwind { entry: ; CHECK-LABEL: my_setbit -; CHECK: memh(r{{[0-9]+}}+#{{[0-9]+}}) = setbit(#15) +; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#15) %crc.addr = alloca i16, align 2 store i16 %crc, i16* %crc.addr, align 2 %0 = load i16, i16* %crc.addr, align 2 diff --git a/test/CodeGen/Hexagon/swp-epilog-phis.ll b/test/CodeGen/Hexagon/swp-epilog-phis.ll deleted file mode 100644 index 1073f1c46b1..00000000000 --- a/test/CodeGen/Hexagon/swp-epilog-phis.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 \ -; RUN: -pipeliner-ignore-recmii -disable-hexagon-nv-schedule \ -; RUN: -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null \ -; RUN: -enable-aa-sched-mi < %s 2>&1 | FileCheck %s --check-prefix=STATS -; REQUIRES: asserts -; -; Test that we generate the correct phis in the last epilog block when -; allowing multiple stages. 
-; -; STATS: 1 pipeliner - Number of loops software pipelined - -; Function Attrs: nounwind -define void @f0() #0 { -b0: - br i1 undef, label %b6, label %b1 - -b1: ; preds = %b0 - br i1 undef, label %b6, label %b2 - -b2: ; preds = %b1 - br label %b4 - -b3: ; preds = %b4, %b3 - %v0 = add nsw i32 0, 57344 - %v1 = trunc i32 %v0 to i16 - store i16 %v1, i16* null, align 2, !tbaa !0 - %v2 = getelementptr inbounds i8, i8* null, i32 undef - %v3 = load i8, i8* %v2, align 1, !tbaa !4 - %v4 = zext i8 %v3 to i32 - %v5 = shl nuw nsw i32 %v4, 6 - %v6 = add nsw i32 %v5, 57344 - %v7 = trunc i32 %v6 to i16 - store i16 %v7, i16* undef, align 2, !tbaa !0 - br i1 undef, label %b5, label %b3 - -b4: ; preds = %b5, %b2 - %v8 = phi i32 [ 0, %b2 ], [ %v9, %b5 ] - br label %b3 - -b5: ; preds = %b3 - %v9 = add i32 %v8, 1 - %v10 = icmp eq i32 %v9, undef - br i1 %v10, label %b6, label %b4 - -b6: ; preds = %b5, %b1, %b0 - ret void -} - -attributes #0 = { nounwind "target-cpu"="hexagonv55" } - -!0 = !{!1, !1, i64 0} -!1 = !{!"short", !2} -!2 = !{!"omnipotent char", !3} -!3 = !{!"Simple C/C++ TBAA"} -!4 = !{!2, !2, i64 0} diff --git a/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll b/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll deleted file mode 100644 index bb45eeac140..00000000000 --- a/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll +++ /dev/null @@ -1,90 +0,0 @@ -; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s - -; Test that a store and load, that alias, are not put in the same packet. The -; pipeliner altered the size of the memrefs for these instructions, which -; resulted in no order dependence between the instructions in the DAG. No order -; dependence was added since the size was set to UINT_MAX, but there is a -; computation using the size that overflowed. 
- -; CHECK: endloop0 -; CHECK: memh([[REG:r([0-9]+)]]+#0) = -; CHECK: = memh([[REG]]++#2) - -; Function Attrs: nounwind -define signext i16 @f0(i16* nocapture readonly %a0, i16* nocapture readonly %a1) local_unnamed_addr #0 { -b0: - %v0 = alloca [40 x i16], align 8 - %v1 = bitcast [40 x i16]* %v0 to i8* - call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %v1) #2 - %v2 = getelementptr inbounds [40 x i16], [40 x i16]* %v0, i32 0, i32 0 - br label %b1 - -b1: ; preds = %b1, %b0 - %v3 = phi i16* [ %a1, %b0 ], [ %v24, %b1 ] - %v4 = phi i16* [ %v2, %b0 ], [ %v25, %b1 ] - %v5 = phi i32 [ 0, %b0 ], [ %v14, %b1 ] - %v6 = phi i32 [ 1, %b0 ], [ %v22, %b1 ] - %v7 = phi i32 [ 0, %b0 ], [ %v23, %b1 ] - %v8 = load i16, i16* %v3, align 2 - %v9 = sext i16 %v8 to i32 - %v10 = tail call i32 @llvm.hexagon.A2.aslh(i32 %v9) - %v11 = tail call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v10, i32 1) - %v12 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v11) - %v13 = trunc i32 %v12 to i16 - store i16 %v13, i16* %v4, align 2 - %v14 = add nuw nsw i32 %v5, 1 - %v15 = icmp eq i32 %v14, 40 - %v16 = getelementptr inbounds i16, i16* %a0, i32 %v7 - %v17 = load i16, i16* %v16, align 2 - %v18 = sext i16 %v17 to i32 - %v19 = getelementptr inbounds [40 x i16], [40 x i16]* %v0, i32 0, i32 %v7 - %v20 = load i16, i16* %v19, align 2 - %v21 = sext i16 %v20 to i32 - %v22 = tail call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32 %v6, i32 %v18, i32 %v21) - %v23 = add nuw nsw i32 %v7, 1 - %v24 = getelementptr i16, i16* %v3, i32 1 - %v25 = getelementptr i16, i16* %v4, i32 1 - br i1 %v15, label %b2, label %b1 - -b2: ; preds = %b1 - %v26 = tail call signext i16 @f1(i32 %v22) #0 - %v27 = sext i16 %v26 to i32 - %v28 = tail call i32 @llvm.hexagon.S2.asl.r.r.sat(i32 %v22, i32 %v27) - %v29 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v28) - %v30 = shl i32 %v29, 16 - %v31 = ashr exact i32 %v30, 16 - %v32 = icmp slt i32 %v30, 65536 - br label %b3 - -b3: ; preds = %b2 - call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %v1) #2 
- ret i16 0 -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 - -; Function Attrs: nounwind readnone -declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.hexagon.A2.aslh(i32) #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.hexagon.A2.asrh(i32) #2 - -; Function Attrs: nounwind readnone -declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32, i32, i32) #2 - -; Function Attrs: nounwind -declare signext i16 @f1(i32) local_unnamed_addr #0 - -; Function Attrs: nounwind readnone -declare i32 @llvm.hexagon.S2.asl.r.r.sat(i32, i32) #2 - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 - -attributes #0 = { nounwind } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind readnone } diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll index b580d2a338c..8cec16683ca 100644 --- a/test/CodeGen/Mips/cconv/vector.ll +++ b/test/CodeGen/Mips/cconv/vector.ll @@ -2053,12 +2053,10 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS64R5-NEXT: sd $4, 24($sp) ; MIPS64R5-NEXT: ldi.b $w0, 0 ; MIPS64R5-NEXT: lw $1, 20($sp) -; MIPS64R5-NEXT: lw $2, 16($sp) ; MIPS64R5-NEXT: move.v $w1, $w0 -; MIPS64R5-NEXT: insert.d $w1[0], $2 +; MIPS64R5-NEXT: insert.d $w1[0], $5 ; MIPS64R5-NEXT: insert.d $w1[1], $1 -; MIPS64R5-NEXT: lw $1, 24($sp) -; MIPS64R5-NEXT: insert.d $w0[0], $1 +; MIPS64R5-NEXT: insert.d $w0[0], $4 ; MIPS64R5-NEXT: lw $1, 28($sp) ; MIPS64R5-NEXT: insert.d $w0[1], $1 ; MIPS64R5-NEXT: addv.d $w0, $w0, $w1 @@ -3533,12 +3531,8 @@ define void @call_i8_2() { ; MIPS32R5EB-NEXT: .cfi_def_cfa_offset 32 ; MIPS32R5EB-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill ; MIPS32R5EB-NEXT: .cfi_offset 31, -4 -; MIPS32R5EB-NEXT: addiu $1, $zero, 1543 -; MIPS32R5EB-NEXT: sh $1, 20($sp) -; MIPS32R5EB-NEXT: addiu $1, $zero, 3080 -; MIPS32R5EB-NEXT: sh $1, 24($sp) -; 
MIPS32R5EB-NEXT: lhu $4, 20($sp) -; MIPS32R5EB-NEXT: lhu $5, 24($sp) +; MIPS32R5EB-NEXT: addiu $4, $zero, 1543 +; MIPS32R5EB-NEXT: addiu $5, $zero, 3080 ; MIPS32R5EB-NEXT: jal i8_2 ; MIPS32R5EB-NEXT: nop ; MIPS32R5EB-NEXT: sw $2, 16($sp) @@ -3645,12 +3639,8 @@ define void @call_i8_2() { ; MIPS32R5EL-NEXT: .cfi_def_cfa_offset 32 ; MIPS32R5EL-NEXT: sw $ra, 28($sp) # 4-byte Folded Spill ; MIPS32R5EL-NEXT: .cfi_offset 31, -4 -; MIPS32R5EL-NEXT: addiu $1, $zero, 1798 -; MIPS32R5EL-NEXT: sh $1, 20($sp) -; MIPS32R5EL-NEXT: addiu $1, $zero, 2060 -; MIPS32R5EL-NEXT: sh $1, 24($sp) -; MIPS32R5EL-NEXT: lhu $4, 20($sp) -; MIPS32R5EL-NEXT: lhu $5, 24($sp) +; MIPS32R5EL-NEXT: addiu $4, $zero, 1798 +; MIPS32R5EL-NEXT: addiu $5, $zero, 2060 ; MIPS32R5EL-NEXT: jal i8_2 ; MIPS32R5EL-NEXT: nop ; MIPS32R5EL-NEXT: sw $2, 16($sp) diff --git a/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll index 4f2339d18c3..efa07590900 100644 --- a/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll +++ b/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll @@ -155,11 +155,10 @@ define i8* @_Z3fooi(i32 signext %Letter) { ; MIPS64R2: # %bb.0: # %entry ; MIPS64R2-NEXT: daddiu $sp, $sp, -16 ; MIPS64R2-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R2-NEXT: sw $4, 4($sp) -; MIPS64R2-NEXT: lwu $2, 4($sp) +; MIPS64R2-NEXT: dext $2, $4, 0, 32 ; MIPS64R2-NEXT: sltiu $1, $2, 7 ; MIPS64R2-NEXT: beqz $1, .LBB0_3 -; MIPS64R2-NEXT: nop +; MIPS64R2-NEXT: sw $4, 4($sp) ; MIPS64R2-NEXT: .LBB0_1: # %entry ; MIPS64R2-NEXT: dsll $1, $2, 3 ; MIPS64R2-NEXT: lui $2, %highest(.LJTI0_0) @@ -251,10 +250,10 @@ define i8* @_Z3fooi(i32 signext %Letter) { ; MIPS64R6: # %bb.0: # %entry ; MIPS64R6-NEXT: daddiu $sp, $sp, -16 ; MIPS64R6-NEXT: .cfi_def_cfa_offset 16 -; MIPS64R6-NEXT: sw $4, 4($sp) -; MIPS64R6-NEXT: lwu $2, 4($sp) +; MIPS64R6-NEXT: dext $2, $4, 0, 32 ; MIPS64R6-NEXT: sltiu $1, $2, 7 -; MIPS64R6-NEXT: beqzc $1, .LBB0_3 +; MIPS64R6-NEXT: beqz $1, .LBB0_3 +; 
MIPS64R6-NEXT: sw $4, 4($sp) ; MIPS64R6-NEXT: .LBB0_1: # %entry ; MIPS64R6-NEXT: dsll $1, $2, 3 ; MIPS64R6-NEXT: lui $2, %highest(.LJTI0_0) @@ -473,11 +472,10 @@ define i8* @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS64R2-NEXT: lui $1, %hi(%neg(%gp_rel(_Z3fooi))) ; PIC-MIPS64R2-NEXT: daddu $1, $1, $25 ; PIC-MIPS64R2-NEXT: daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi))) -; PIC-MIPS64R2-NEXT: sw $4, 4($sp) -; PIC-MIPS64R2-NEXT: lwu $3, 4($sp) +; PIC-MIPS64R2-NEXT: dext $3, $4, 0, 32 ; PIC-MIPS64R2-NEXT: sltiu $1, $3, 7 ; PIC-MIPS64R2-NEXT: beqz $1, .LBB0_3 -; PIC-MIPS64R2-NEXT: nop +; PIC-MIPS64R2-NEXT: sw $4, 4($sp) ; PIC-MIPS64R2-NEXT: .LBB0_1: # %entry ; PIC-MIPS64R2-NEXT: dsll $1, $3, 3 ; PIC-MIPS64R2-NEXT: ld $3, %got_page(.LJTI0_0)($2) @@ -537,10 +535,10 @@ define i8* @_Z3fooi(i32 signext %Letter) { ; PIC-MIPS64R6-NEXT: lui $1, %hi(%neg(%gp_rel(_Z3fooi))) ; PIC-MIPS64R6-NEXT: daddu $1, $1, $25 ; PIC-MIPS64R6-NEXT: daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi))) -; PIC-MIPS64R6-NEXT: sw $4, 4($sp) -; PIC-MIPS64R6-NEXT: lwu $3, 4($sp) +; PIC-MIPS64R6-NEXT: dext $3, $4, 0, 32 ; PIC-MIPS64R6-NEXT: sltiu $1, $3, 7 -; PIC-MIPS64R6-NEXT: beqzc $1, .LBB0_3 +; PIC-MIPS64R6-NEXT: beqz $1, .LBB0_3 +; PIC-MIPS64R6-NEXT: sw $4, 4($sp) ; PIC-MIPS64R6-NEXT: .LBB0_1: # %entry ; PIC-MIPS64R6-NEXT: dsll $1, $3, 3 ; PIC-MIPS64R6-NEXT: ld $3, %got_page(.LJTI0_0)($2) diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll index d61f05dc868..19eb80b79ba 100644 --- a/test/CodeGen/Mips/o32_cc_byval.ll +++ b/test/CodeGen/Mips/o32_cc_byval.ll @@ -109,7 +109,8 @@ define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind { ; CHECK-NEXT: lw $1, 64($sp) ; CHECK-NEXT: lw $2, 68($sp) ; CHECK-NEXT: lh $3, 58($sp) -; CHECK-NEXT: lb $5, 56($sp) +; CHECK-NEXT: sll $5, $6, 24 +; CHECK-NEXT: sra $5, $5, 24 ; CHECK-NEXT: swc1 $f12, 36($sp) ; CHECK-NEXT: sw $5, 32($sp) ; CHECK-NEXT: sw $3, 28($sp) @@ -191,11 +192,12 @@ define void @f4(float %f, %struct.S3* nocapture byval 
%s3, %struct.S1* nocapture ; CHECK-NEXT: sw $ra, 44($sp) # 4-byte Folded Spill ; CHECK-NEXT: addu $gp, $2, $25 ; CHECK-NEXT: move $4, $7 -; CHECK-NEXT: sw $5, 52($sp) ; CHECK-NEXT: sw $6, 56($sp) +; CHECK-NEXT: sw $5, 52($sp) ; CHECK-NEXT: sw $7, 60($sp) ; CHECK-NEXT: lw $1, 80($sp) -; CHECK-NEXT: lb $2, 52($sp) +; CHECK-NEXT: sll $2, $5, 24 +; CHECK-NEXT: sra $2, $2, 24 ; CHECK-NEXT: addiu $3, $zero, 4 ; CHECK-NEXT: lui $5, 16576 ; CHECK-NEXT: sw $5, 36($sp) diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll index 73aad48b73e..27d454f31d9 100644 --- a/test/CodeGen/Mips/o32_cc_vararg.ll +++ b/test/CodeGen/Mips/o32_cc_vararg.ll @@ -29,10 +29,10 @@ entry: ; CHECK-LABEL: va1: ; CHECK: addiu $sp, $sp, -16 -; CHECK: sw $5, 20($sp) ; CHECK: sw $7, 28($sp) ; CHECK: sw $6, 24($sp) -; CHECK: lw $2, 20($sp) +; CHECK: sw $5, 20($sp) +; CHECK: move $2, $5 } ; check whether the variable double argument will be accessed from the 8-byte @@ -83,9 +83,9 @@ entry: ; CHECK-LABEL: va3: ; CHECK: addiu $sp, $sp, -16 -; CHECK: sw $6, 24($sp) ; CHECK: sw $7, 28($sp) -; CHECK: lw $2, 24($sp) +; CHECK: sw $6, 24($sp) +; CHECK: move $2, $6 } ; double @@ -135,7 +135,7 @@ entry: ; CHECK-LABEL: va5: ; CHECK: addiu $sp, $sp, -24 ; CHECK: sw $7, 36($sp) -; CHECK: lw $2, 36($sp) +; CHECK: move $2, $7 } ; double diff --git a/test/CodeGen/PowerPC/addi-offset-fold.ll b/test/CodeGen/PowerPC/addi-offset-fold.ll index ab00a4dab3a..7af99203694 100644 --- a/test/CodeGen/PowerPC/addi-offset-fold.ll +++ b/test/CodeGen/PowerPC/addi-offset-fold.ll @@ -24,12 +24,11 @@ entry: ret i32 %bf.cast ; CHECK-LABEL: @foo -; FIXME: We don't need to do these stores/loads at all. +; FIXME: We don't need to do these stores at all. 
; CHECK-DAG: std 3, -24(1) ; CHECK-DAG: stb 4, -16(1) -; CHECK-DAG: lbz [[REG1:[0-9]+]], -16(1) +; CHECK-DAG: sldi [[REG3:[0-9]+]], 4, 32 ; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1) -; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG1]], 32 ; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]] ; CHECK: rldicl 3, [[REG4]], 33, 57 ; CHECK: blr diff --git a/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll index 8b7184f38e8..60a6a180467 100644 --- a/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll +++ b/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll @@ -60,8 +60,7 @@ define i16 @fun1(<16 x i1> %src) ; CHECK-NEXT: rosbg %r0, %r1, 62, 62, 1 ; CHECK-NEXT: vlgvb %r1, %v24, 15 ; CHECK-NEXT: rosbg %r0, %r1, 63, 63, 0 -; CHECK-NEXT: sth %r0, 160(%r15) -; CHECK-NEXT: lh %r2, 160(%r15) +; CHECK-NEXT: llhr %r2, %r0 ; CHECK-NEXT: aghi %r15, 168 ; CHECK-NEXT: br %r14 { diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll index 8a5b92a82fb..495ead223b2 100644 --- a/test/CodeGen/X86/i386-shrink-wrapping.ll +++ b/test/CodeGen/X86/i386-shrink-wrapping.ll @@ -56,7 +56,7 @@ target triple = "i386-apple-macosx10.5" ; ; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]] ; CHECK-NEXT: movb %dl, ([[E]]) -; CHECK-NEXT: movsbl ([[E]]), [[CONV:%[a-z]+]] +; CHECK-NEXT: movzbl %dl, [[CONV:%[a-z]+]] ; CHECK-NEXT: movl $6, [[CONV:%[a-z]+]] ; The eflags is used in the next instruction. 
; If that instruction disappear, we are not exercising the bug diff --git a/test/CodeGen/X86/pr32108.ll b/test/CodeGen/X86/pr32108.ll index bde5daff285..dc14746440a 100644 --- a/test/CodeGen/X86/pr32108.ll +++ b/test/CodeGen/X86/pr32108.ll @@ -4,7 +4,6 @@ define void @pr32108() { ; CHECK-LABEL: pr32108: ; CHECK: # %bb.0: # %BB -; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %CF244 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/test/CodeGen/X86/pr38533.ll b/test/CodeGen/X86/pr38533.ll index 96d003ba1a8..59c67acc9be 100644 --- a/test/CodeGen/X86/pr38533.ll +++ b/test/CodeGen/X86/pr38533.ll @@ -19,8 +19,6 @@ define void @pr38533_2(half %x) { ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; SSE-NEXT: movzwl {{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movw %ax, (%rax) ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 @@ -30,8 +28,6 @@ define void @pr38533_2(half %x) { ; AVX512: # %bb.0: ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; AVX512-NEXT: movw %ax, (%rax) ; AVX512-NEXT: retq %a = bitcast half %x to <4 x i4> @@ -46,8 +42,6 @@ define void @pr38533_3(half %x) { ; SSE-NEXT: pushq %rax ; SSE-NEXT: .cfi_def_cfa_offset 16 ; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movw %ax, (%rsp) -; SSE-NEXT: movzwl (%rsp), %eax ; SSE-NEXT: movw %ax, (%rax) ; SSE-NEXT: popq %rax ; SSE-NEXT: .cfi_def_cfa_offset 8 diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll index fc9a10ed586..f0aff6f89bc 100644 --- a/test/CodeGen/X86/win64_vararg.ll +++ b/test/CodeGen/X86/win64_vararg.ll @@ -124,7 +124,8 @@ entry: ; CHECK: movq %rcx, %rax ; CHECK-DAG: movq %r9, 40(%rsp) ; CHECK-DAG: movq %r8, 32(%rsp) -; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]] -; CHECK: movl %[[tmp]], (%rax) +; CHECK-DAG: leaq 36(%rsp), 
%[[sret:[^ ]*]] +; CHECK-DAG: movl %r8d, (%rax) +; CHECK-DAG: movq %[[sret]], (%rsp) ; CHECK: popq ; CHECK: retq -- GitLab From 444acbd06151972689d80403f079bbcae562d11e Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Wed, 10 Oct 2018 14:22:48 +0000 Subject: [PATCH 0003/1116] [llvm-exegesis][NFC] Code simplification Summary: Simplify code by having LLVMState hold the RegisterAliasingTrackerCache. Reviewers: courbet Subscribers: tschuett, llvm-commits Differential Revision: https://reviews.llvm.org/D53078 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344143 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/Latency.cpp | 5 ++--- tools/llvm-exegesis/lib/LlvmState.cpp | 2 ++ tools/llvm-exegesis/lib/LlvmState.h | 3 +++ tools/llvm-exegesis/lib/MCInstrDescView.cpp | 18 +++++++++--------- tools/llvm-exegesis/lib/MCInstrDescView.h | 4 ++-- tools/llvm-exegesis/lib/SnippetGenerator.cpp | 7 +++---- tools/llvm-exegesis/lib/SnippetGenerator.h | 1 - tools/llvm-exegesis/lib/Uops.cpp | 9 +++++---- tools/llvm-exegesis/lib/X86/Target.cpp | 6 +++--- .../llvm-exegesis/X86/SnippetGeneratorTest.cpp | 2 +- 10 files changed, 30 insertions(+), 27 deletions(-) diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp index 4173cf3f9a1..ea646f4f261 100644 --- a/tools/llvm-exegesis/lib/Latency.cpp +++ b/tools/llvm-exegesis/lib/Latency.cpp @@ -32,8 +32,7 @@ LatencySnippetGenerator::generateTwoInstructionPrototype( for (const unsigned OtherOpcode : Opcodes) { if (OtherOpcode == Instr.Description->Opcode) continue; - const auto &OtherInstrDesc = State.getInstrInfo().get(OtherOpcode); - const Instruction OtherInstr(OtherInstrDesc, RATC); + const Instruction OtherInstr(State, OtherOpcode); if (OtherInstr.hasMemoryOperands()) continue; const AliasingConfigurations Forward(Instr, OtherInstr); @@ -59,7 +58,7 @@ LatencySnippetGenerator::generateTwoInstructionPrototype( llvm::Expected LatencySnippetGenerator::generateCodeTemplate(unsigned 
Opcode) const { - const Instruction Instr(State.getInstrInfo().get(Opcode), RATC); + const Instruction Instr(State, Opcode); if (Instr.hasMemoryOperands()) return llvm::make_error( "Infeasible : has memory operands"); diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp index 9ff42ca71fd..279792e9031 100644 --- a/tools/llvm-exegesis/lib/LlvmState.cpp +++ b/tools/llvm-exegesis/lib/LlvmState.cpp @@ -35,6 +35,8 @@ LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) { llvm::errs() << "no exegesis target for " << Triple << ", using default\n"; TheExegesisTarget = &ExegesisTarget::getDefault(); } + RATC.reset(new RegisterAliasingTrackerCache( + getRegInfo(), getFunctionReservedRegs(getTargetMachine()))); } LLVMState::LLVMState() diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h index c84db300841..aa7705a36a6 100644 --- a/tools/llvm-exegesis/lib/LlvmState.h +++ b/tools/llvm-exegesis/lib/LlvmState.h @@ -15,6 +15,7 @@ #ifndef LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H #define LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H +#include "RegisterAliasing.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -54,10 +55,12 @@ public: const llvm::MCSubtargetInfo &getSubtargetInfo() const { return *TargetMachine->getMCSubtargetInfo(); } + const RegisterAliasingTrackerCache &getRATC() const { return *RATC; } private: const ExegesisTarget *TheExegesisTarget; std::unique_ptr TargetMachine; + std::unique_ptr RATC; }; } // namespace exegesis diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp index 75d85873146..d54f3ca2a45 100644 --- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -87,24 +87,24 @@ const llvm::MCOperandInfo &Operand::getExplicitOperandInfo() const { return *Info; } -Instruction::Instruction(const llvm::MCInstrDesc &MCInstrDesc, - const 
RegisterAliasingTrackerCache &RATC) - : Description(&MCInstrDesc) { +Instruction::Instruction(const LLVMState &State, unsigned Opcode) + : Description(&State.getInstrInfo().get(Opcode)) { + const auto &RATC = State.getRATC(); unsigned OpIndex = 0; - for (; OpIndex < MCInstrDesc.getNumOperands(); ++OpIndex) { - const auto &OpInfo = MCInstrDesc.opInfo_begin()[OpIndex]; + for (; OpIndex < Description->getNumOperands(); ++OpIndex) { + const auto &OpInfo = Description->opInfo_begin()[OpIndex]; Operand Operand; Operand.Index = OpIndex; - Operand.IsDef = (OpIndex < MCInstrDesc.getNumDefs()); + Operand.IsDef = (OpIndex < Description->getNumDefs()); // TODO(gchatelet): Handle isLookupPtrRegClass. if (OpInfo.RegClass >= 0) Operand.Tracker = &RATC.getRegisterClass(OpInfo.RegClass); Operand.TiedToIndex = - MCInstrDesc.getOperandConstraint(OpIndex, llvm::MCOI::TIED_TO); + Description->getOperandConstraint(OpIndex, llvm::MCOI::TIED_TO); Operand.Info = &OpInfo; Operands.push_back(Operand); } - for (const llvm::MCPhysReg *MCPhysReg = MCInstrDesc.getImplicitDefs(); + for (const llvm::MCPhysReg *MCPhysReg = Description->getImplicitDefs(); MCPhysReg && *MCPhysReg; ++MCPhysReg, ++OpIndex) { Operand Operand; Operand.Index = OpIndex; @@ -113,7 +113,7 @@ Instruction::Instruction(const llvm::MCInstrDesc &MCInstrDesc, Operand.ImplicitReg = MCPhysReg; Operands.push_back(Operand); } - for (const llvm::MCPhysReg *MCPhysReg = MCInstrDesc.getImplicitUses(); + for (const llvm::MCPhysReg *MCPhysReg = Description->getImplicitUses(); MCPhysReg && *MCPhysReg; ++MCPhysReg, ++OpIndex) { Operand Operand; Operand.Index = OpIndex; diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h index 39e5c4a5f5b..914bf51a22b 100644 --- a/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -21,6 +21,7 @@ #include +#include "LlvmState.h" #include "RegisterAliasing.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" @@ 
-92,8 +93,7 @@ struct Operand { // A view over an MCInstrDesc offering a convenient interface to compute // Register aliasing. struct Instruction { - Instruction(const llvm::MCInstrDesc &MCInstrDesc, - const RegisterAliasingTrackerCache &ATC); + Instruction(const LLVMState &State, unsigned Opcode); // Returns the Operand linked to this Variable. // In case the Variable is tied, the primary (i.e. Def) Operand is returned. diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 3765776f724..16dbd214e95 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -25,9 +25,7 @@ namespace exegesis { SnippetGeneratorFailure::SnippetGeneratorFailure(const llvm::Twine &S) : llvm::StringError(S, llvm::inconvertibleErrorCode()) {} -SnippetGenerator::SnippetGenerator(const LLVMState &State) - : State(State), RATC(State.getRegInfo(), - getFunctionReservedRegs(State.getTargetMachine())) {} +SnippetGenerator::SnippetGenerator(const LLVMState &State) : State(State) {} SnippetGenerator::~SnippetGenerator() = default; @@ -35,6 +33,7 @@ llvm::Expected> SnippetGenerator::generateConfigurations(unsigned Opcode) const { if (auto E = generateCodeTemplate(Opcode)) { CodeTemplate &CT = E.get(); + const auto &RATC = State.getRATC(); const llvm::BitVector &ForbiddenRegs = CT.ScratchSpacePointerInReg ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits() @@ -64,7 +63,7 @@ std::vector SnippetGenerator::computeRegisterInitialValues( // Ignore memory operands which are handled separately. // Loop invariant: DefinedRegs[i] is true iif it has been set at least once // before the current instruction. 
- llvm::BitVector DefinedRegs = RATC.emptyRegisters(); + llvm::BitVector DefinedRegs = State.getRATC().emptyRegisters(); std::vector RIV; for (const InstructionTemplate &IT : Instructions) { // Returns the register that this Operand sets or uses, or 0 if this is not diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h index 9493c584816..24afe95fda0 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -54,7 +54,6 @@ public: protected: const LLVMState &State; - const RegisterAliasingTrackerCache RATC; // Generates a single code template that has a self-dependency. llvm::Expected diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp index 2208e2a3821..fdb6a27ab59 100644 --- a/tools/llvm-exegesis/lib/Uops.cpp +++ b/tools/llvm-exegesis/lib/Uops.cpp @@ -130,7 +130,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const { CodeTemplate CT; const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr; - const Instruction Instr(State.getInstrInfo().get(Opcode), RATC); + const Instruction Instr(State, Opcode); if (Instr.hasMemoryOperands()) { CT.ScratchSpacePointerInReg = ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple()); @@ -138,7 +138,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const { return llvm::make_error( "Infeasible : target does not support memory instructions"); ScratchSpaceAliasedRegs = - &RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits(); + &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits(); // If the instruction implicitly writes to ScratchSpacePointerInReg , abort. // FIXME: We could make a copy of the scratch register. 
for (const auto &Op : Instr.Operands) { @@ -185,12 +185,13 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const { instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); return std::move(CT); } + const auto &ReservedRegisters = State.getRATC().reservedRegisters(); // No tied variables, we pick random values for defs. llvm::BitVector Defs(State.getRegInfo().getNumRegs()); for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); - remove(PossibleRegisters, RATC.reservedRegisters()); + remove(PossibleRegisters, ReservedRegisters); // Do not use the scratch memory address register. if (ScratchSpaceAliasedRegs) remove(PossibleRegisters, *ScratchSpaceAliasedRegs); @@ -205,7 +206,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const { for (const auto &Op : Instr.Operands) { if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) { auto PossibleRegisters = Op.getRegisterAliasing().sourceBits(); - remove(PossibleRegisters, RATC.reservedRegisters()); + remove(PossibleRegisters, ReservedRegisters); // Do not use the scratch memory address register. if (ScratchSpaceAliasedRegs) remove(PossibleRegisters, *ScratchSpaceAliasedRegs); diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index 4a9cb08e27a..8c03f1ac826 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -37,9 +37,9 @@ template class X86SnippetGenerator : public Impl { } // Handle X87. 
- const auto &InstrDesc = InstrInfo.get(Opcode); - const unsigned FPInstClass = InstrDesc.TSFlags & llvm::X86II::FPTypeMask; - const Instruction Instr(InstrDesc, this->RATC); + const unsigned FPInstClass = + InstrInfo.get(Opcode).TSFlags & llvm::X86II::FPTypeMask; + const Instruction Instr(this->State, Opcode); switch (FPInstClass) { case llvm::X86II::NotFP: break; diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp index 9685c730b8b..f2539aaea18 100644 --- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp +++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp @@ -248,7 +248,7 @@ public: FakeSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {} Instruction createInstruction(unsigned Opcode) { - return Instruction(State.getInstrInfo().get(Opcode), RATC); + return Instruction(State, Opcode); } private: -- GitLab From e144c76bb4fb9fb8bdfcbaf53b1b353da0cdbe05 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Wed, 10 Oct 2018 14:46:54 +0000 Subject: [PATCH 0004/1116] [llvm-mca][BtVer2] Add two more move-elimination tests. NFC These should test all the optimizable moves on Jaguar. A follow-up patch will teach how to recognize these optimizable register moves. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344144 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/BtVer2/reg-move-elimination-2.s | 137 ++++++++++++++++++ .../X86/BtVer2/reg-move-elimination-3.s | 122 ++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s new file mode 100644 index 00000000000..33cd3972194 --- /dev/null +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s @@ -0,0 +1,137 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s + +pxor %mm0, %mm0 +movq %mm0, %mm1 + +xorps %xmm0, %xmm0 +movaps %xmm0, %xmm1 +movups %xmm1, %xmm2 +movapd %xmm2, %xmm3 +movupd %xmm3, %xmm4 +movdqa %xmm4, %xmm5 +movdqu %xmm5, %xmm0 + +# CHECK: Iterations: 3 +# CHECK-NEXT: Instructions: 27 +# CHECK-NEXT: Total Cycles: 19 +# CHECK-NEXT: Total uOps: 27 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.42 +# CHECK-NEXT: IPC: 1.42 +# CHECK-NEXT: Block RThroughput: 4.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 0 0.50 pxor %mm0, %mm0 +# CHECK-NEXT: 1 1 0.50 movq %mm0, %mm1 +# CHECK-NEXT: 1 0 0.50 xorps %xmm0, %xmm0 +# CHECK-NEXT: 1 1 0.50 movaps %xmm0, %xmm1 +# CHECK-NEXT: 1 1 0.50 movups %xmm1, %xmm2 +# CHECK-NEXT: 1 1 0.50 movapd %xmm2, %xmm3 +# CHECK-NEXT: 1 1 0.50 movupd %xmm3, %xmm4 +# CHECK-NEXT: 1 1 0.50 movdqa %xmm4, %xmm5 +# CHECK-NEXT: 1 1 0.50 movdqu %xmm5, %xmm0 + +# CHECK: Register 
File statistics: +# CHECK-NEXT: Total number of mappings created: 21 +# CHECK-NEXT: Max number of mappings used: 8 + +# CHECK: * Register File #1 -- JFpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 21 +# CHECK-NEXT: Max number of mappings used: 8 + +# CHECK: * Register File #2 -- JIntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 2.00 2.00 3.33 3.67 - - - - 1.33 1.67 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - - - - - - - - pxor %mm0, %mm0 +# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - movq %mm0, %mm1 +# CHECK-NEXT: - - - - - - - - - - - - - - xorps %xmm0, %xmm0 +# CHECK-NEXT: - - - - 1.00 0.33 0.67 - - - - - - - movaps %xmm0, %xmm1 +# CHECK-NEXT: - - - 1.00 - 0.33 0.67 - - - - - - - movups %xmm1, %xmm2 +# CHECK-NEXT: - - - - 1.00 0.67 0.33 - - - - - - - movapd %xmm2, %xmm3 +# CHECK-NEXT: - - - 1.00 - 0.33 0.67 - - - - - - - movupd %xmm3, %xmm4 +# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - movdqa %xmm4, %xmm5 +# CHECK-NEXT: - - - - - 0.67 0.33 - - - - 0.33 0.67 - movdqu %xmm5, %xmm0 + +# CHECK: Timeline view: +# CHECK-NEXT: 012345678 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DR . . . . pxor %mm0, %mm0 +# CHECK-NEXT: [0,1] DeER . . . . 
movq %mm0, %mm1 +# CHECK-NEXT: [0,2] .D-R . . . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [0,3] .DeER. . . . movaps %xmm0, %xmm1 +# CHECK-NEXT: [0,4] . DeER . . . movups %xmm1, %xmm2 +# CHECK-NEXT: [0,5] . D=eER . . . movapd %xmm2, %xmm3 +# CHECK-NEXT: [0,6] . D=eER . . . movupd %xmm3, %xmm4 +# CHECK-NEXT: [0,7] . D==eER . . . movdqa %xmm4, %xmm5 +# CHECK-NEXT: [0,8] . D==eER. . . movdqu %xmm5, %xmm0 +# CHECK-NEXT: [1,0] . D----R. . . pxor %mm0, %mm0 +# CHECK-NEXT: [1,1] . DeE--R . . movq %mm0, %mm1 +# CHECK-NEXT: [1,2] . D----R . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [1,3] . .DeE--R . . movaps %xmm0, %xmm1 +# CHECK-NEXT: [1,4] . .D=eE-R . . movups %xmm1, %xmm2 +# CHECK-NEXT: [1,5] . . D=eE-R . . movapd %xmm2, %xmm3 +# CHECK-NEXT: [1,6] . . D==eER . . movupd %xmm3, %xmm4 +# CHECK-NEXT: [1,7] . . D==eER . . movdqa %xmm4, %xmm5 +# CHECK-NEXT: [1,8] . . D===eER. . movdqu %xmm5, %xmm0 +# CHECK-NEXT: [2,0] . . D----R. . pxor %mm0, %mm0 +# CHECK-NEXT: [2,1] . . DeE---R . movq %mm0, %mm1 +# CHECK-NEXT: [2,2] . . D----R . xorps %xmm0, %xmm0 +# CHECK-NEXT: [2,3] . . DeE---R . movaps %xmm0, %xmm1 +# CHECK-NEXT: [2,4] . . .DeE--R . movups %xmm1, %xmm2 +# CHECK-NEXT: [2,5] . . .D=eE--R. movapd %xmm2, %xmm3 +# CHECK-NEXT: [2,6] . . . D=eE-R. movupd %xmm3, %xmm4 +# CHECK-NEXT: [2,7] . . . D==eE-R movdqa %xmm4, %xmm5 +# CHECK-NEXT: [2,8] . . . D==eER movdqu %xmm5, %xmm0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 0.0 0.0 2.7 pxor %mm0, %mm0 +# CHECK-NEXT: 1. 3 1.0 1.0 1.7 movq %mm0, %mm1 +# CHECK-NEXT: 2. 3 0.0 0.0 3.0 xorps %xmm0, %xmm0 +# CHECK-NEXT: 3. 3 1.0 1.0 1.7 movaps %xmm0, %xmm1 +# CHECK-NEXT: 4. 3 1.3 0.0 1.0 movups %xmm1, %xmm2 +# CHECK-NEXT: 5. 
3 2.0 0.0 1.0 movapd %xmm2, %xmm3 +# CHECK-NEXT: 6. 3 2.3 0.0 0.3 movupd %xmm3, %xmm4 +# CHECK-NEXT: 7. 3 3.0 0.0 0.3 movdqa %xmm4, %xmm5 +# CHECK-NEXT: 8. 3 3.3 0.0 0.0 movdqu %xmm5, %xmm0 diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s new file mode 100644 index 00000000000..e3e0abc75e7 --- /dev/null +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s @@ -0,0 +1,122 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s + +vxorps %xmm0, %xmm0, %xmm0 +vmovaps %xmm0, %xmm1 +vmovups %xmm1, %xmm2 +vmovapd %xmm2, %xmm3 +vmovupd %xmm3, %xmm4 +vmovdqa %xmm4, %xmm5 +vmovdqu %xmm5, %xmm0 + +# CHECK: Iterations: 3 +# CHECK-NEXT: Instructions: 21 +# CHECK-NEXT: Total Cycles: 16 +# CHECK-NEXT: Total uOps: 21 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.31 +# CHECK-NEXT: IPC: 1.31 +# CHECK-NEXT: Block RThroughput: 3.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 0 0.50 vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: 1 1 0.50 vmovaps %xmm0, %xmm1 +# CHECK-NEXT: 1 1 0.50 vmovups %xmm1, %xmm2 +# CHECK-NEXT: 1 1 0.50 vmovapd %xmm2, %xmm3 +# CHECK-NEXT: 1 1 0.50 vmovupd %xmm3, %xmm4 +# CHECK-NEXT: 1 1 0.50 vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: 1 1 0.50 vmovdqu %xmm5, %xmm0 + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 18 +# CHECK-NEXT: Max number of mappings used: 9 + +# CHECK: * Register File #1 -- JFpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 18 +# CHECK-NEXT: Max number of mappings used: 9 + +# CHECK: 
* Register File #2 -- JIntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: - - - 2.00 2.00 3.00 3.00 - - - - 1.00 1.00 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: - - - - 1.00 0.33 0.67 - - - - - - - vmovaps %xmm0, %xmm1 +# CHECK-NEXT: - - - 1.00 - 0.67 0.33 - - - - - - - vmovups %xmm1, %xmm2 +# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmovapd %xmm2, %xmm3 +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmovupd %xmm3, %xmm4 +# CHECK-NEXT: - - - - - 0.33 0.67 - - - - - 1.00 - vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: - - - - - 0.67 0.33 - - - - 1.00 - - vmovdqu %xmm5, %xmm0 + +# CHECK: Timeline view: +# CHECK-NEXT: 012345 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DR . . . vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [0,1] DeER . . . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [0,2] .DeER. . . vmovups %xmm1, %xmm2 +# CHECK-NEXT: [0,3] .D=eER . . vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [0,4] . D=eER . . vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [0,5] . D==eER . . vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [0,6] . D==eER . . vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: [1,0] . D----R . . vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [1,1] . DeE--R. . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [1,2] . 
D=eE-R. . vmovups %xmm1, %xmm2 +# CHECK-NEXT: [1,3] . D=eE-R . vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [1,4] . D==eER . vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [1,5] . .D==eER . vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [1,6] . .D===eER . vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: [2,0] . . D----R . vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [2,1] . . DeE---R . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [2,2] . . DeE--R . vmovups %xmm1, %xmm2 +# CHECK-NEXT: [2,3] . . D=eE--R. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [2,4] . . D=eE-R. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [2,5] . . D==eE-R vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [2,6] . . D==eER vmovdqu %xmm5, %xmm0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: 1. 3 1.0 1.0 1.7 vmovaps %xmm0, %xmm1 +# CHECK-NEXT: 2. 3 1.3 0.0 1.0 vmovups %xmm1, %xmm2 +# CHECK-NEXT: 3. 3 2.0 0.0 1.0 vmovapd %xmm2, %xmm3 +# CHECK-NEXT: 4. 3 2.3 0.0 0.3 vmovupd %xmm3, %xmm4 +# CHECK-NEXT: 5. 3 3.0 0.0 0.3 vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: 6. 
3 3.3 0.0 0.0 vmovdqu %xmm5, %xmm0 -- GitLab From 09f76c80ba8d7572c8121d173b32d24f7259a795 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Wed, 10 Oct 2018 14:57:32 +0000 Subject: [PATCH 0005/1116] [llvm-exegesis][NFC] Pass Instruction instead of bare Opcode git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344145 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/Latency.cpp | 3 +-- tools/llvm-exegesis/lib/Latency.h | 2 +- tools/llvm-exegesis/lib/MCInstrDescView.cpp | 15 +++++++++++---- tools/llvm-exegesis/lib/MCInstrDescView.h | 1 + tools/llvm-exegesis/lib/SnippetGenerator.cpp | 4 ++-- tools/llvm-exegesis/lib/SnippetGenerator.h | 4 ++-- tools/llvm-exegesis/lib/Uops.cpp | 6 ++---- tools/llvm-exegesis/lib/Uops.h | 2 +- tools/llvm-exegesis/lib/X86/Target.cpp | 10 ++++------ tools/llvm-exegesis/llvm-exegesis.cpp | 14 +++++++------- .../llvm-exegesis/X86/SnippetGeneratorTest.cpp | 8 +++++--- 11 files changed, 37 insertions(+), 32 deletions(-) diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp index ea646f4f261..f6786b123ad 100644 --- a/tools/llvm-exegesis/lib/Latency.cpp +++ b/tools/llvm-exegesis/lib/Latency.cpp @@ -57,8 +57,7 @@ LatencySnippetGenerator::generateTwoInstructionPrototype( } llvm::Expected -LatencySnippetGenerator::generateCodeTemplate(unsigned Opcode) const { - const Instruction Instr(State, Opcode); +LatencySnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { if (Instr.hasMemoryOperands()) return llvm::make_error( "Infeasible : has memory operands"); diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h index 37feb62e3dc..83c798f60f3 100644 --- a/tools/llvm-exegesis/lib/Latency.h +++ b/tools/llvm-exegesis/lib/Latency.h @@ -27,7 +27,7 @@ public: ~LatencySnippetGenerator() override; llvm::Expected - generateCodeTemplate(unsigned Opcode) const override; + generateCodeTemplate(const Instruction &Instr) const override; private: llvm::Expected 
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp index d54f3ca2a45..fa9378856f4 100644 --- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp +++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp @@ -88,7 +88,8 @@ const llvm::MCOperandInfo &Operand::getExplicitOperandInfo() const { } Instruction::Instruction(const LLVMState &State, unsigned Opcode) - : Description(&State.getInstrInfo().get(Opcode)) { + : Description(&State.getInstrInfo().get(Opcode)), + Name(State.getInstrInfo().getName(Opcode)) { const auto &RATC = State.getRATC(); unsigned OpIndex = 0; for (; OpIndex < Description->getNumOperands(); ++OpIndex) { @@ -198,6 +199,7 @@ bool Instruction::hasAliasingRegisters() const { void Instruction::dump(const llvm::MCRegisterInfo &RegInfo, llvm::raw_ostream &Stream) const { + Stream << "- " << Name << "\n"; for (const auto &Op : Operands) { Stream << "- Op" << Op.getIndex(); if (Op.isExplicit()) @@ -227,10 +229,15 @@ void Instruction::dump(const llvm::MCRegisterInfo &RegInfo, } for (const auto &Var : Variables) { Stream << "- Var" << Var.getIndex(); - Stream << " ("; - for (auto OperandIndex : Var.TiedOperands) + Stream << " ["; + bool IsFirst = true; + for (auto OperandIndex : Var.TiedOperands) { + if (!IsFirst) + Stream << ","; Stream << "Op" << OperandIndex; - Stream << ")"; + IsFirst = false; + } + Stream << "]"; Stream << "\n"; } if (hasMemoryOperands()) diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h index 914bf51a22b..265476ae125 100644 --- a/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -130,6 +130,7 @@ struct Instruction { llvm::raw_ostream &Stream) const; const llvm::MCInstrDesc *Description; // Never nullptr. + llvm::StringRef Name; // The name of this instruction. llvm::SmallVector Operands; llvm::SmallVector Variables; llvm::BitVector ImplDefRegs; // The set of aliased implicit def registers. 
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 16dbd214e95..f7a76d88ccf 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -30,8 +30,8 @@ SnippetGenerator::SnippetGenerator(const LLVMState &State) : State(State) {} SnippetGenerator::~SnippetGenerator() = default; llvm::Expected> -SnippetGenerator::generateConfigurations(unsigned Opcode) const { - if (auto E = generateCodeTemplate(Opcode)) { +SnippetGenerator::generateConfigurations(const Instruction &Instr) const { + if (auto E = generateCodeTemplate(Instr)) { CodeTemplate &CT = E.get(); const auto &RATC = State.getRATC(); const llvm::BitVector &ForbiddenRegs = diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h index 24afe95fda0..c9a19cd0eeb 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -46,7 +46,7 @@ public: // Calls generateCodeTemplate and expands it into one or more BenchmarkCode. llvm::Expected> - generateConfigurations(unsigned Opcode) const; + generateConfigurations(const Instruction &Instr) const; // Given a snippet, computes which registers the setup code needs to define. std::vector computeRegisterInitialValues( @@ -66,7 +66,7 @@ protected: private: // API to be implemented by subclasses. virtual llvm::Expected - generateCodeTemplate(unsigned Opcode) const = 0; + generateCodeTemplate(const Instruction &Instr) const = 0; }; // A global Random Number Generator to randomize configurations. 
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp index fdb6a27ab59..1a701d169eb 100644 --- a/tools/llvm-exegesis/lib/Uops.cpp +++ b/tools/llvm-exegesis/lib/Uops.cpp @@ -125,13 +125,11 @@ void UopsSnippetGenerator::instantiateMemoryOperands( } llvm::Expected -UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const { - const auto &ET = State.getExegesisTarget(); +UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { CodeTemplate CT; - const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr; - const Instruction Instr(State, Opcode); if (Instr.hasMemoryOperands()) { + const auto &ET = State.getExegesisTarget(); CT.ScratchSpacePointerInReg = ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple()); if (CT.ScratchSpacePointerInReg == 0) diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h index 33d0d8b1596..1cfa8242078 100644 --- a/tools/llvm-exegesis/lib/Uops.h +++ b/tools/llvm-exegesis/lib/Uops.h @@ -26,7 +26,7 @@ public: ~UopsSnippetGenerator() override; llvm::Expected - generateCodeTemplate(unsigned Opcode) const override; + generateCodeTemplate(const Instruction &Instr) const override; static constexpr const size_t kMinNumDifferentAddresses = 6; diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index 8c03f1ac826..440996ad555 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -26,10 +26,9 @@ template class X86SnippetGenerator : public Impl { using Impl::Impl; llvm::Expected - generateCodeTemplate(unsigned Opcode) const override { + generateCodeTemplate(const Instruction &Instr) const override { // Test whether we can generate a snippet for this instruction. 
- const auto &InstrInfo = this->State.getInstrInfo(); - const auto OpcodeName = InstrInfo.getName(Opcode); + const auto OpcodeName = Instr.Name; if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") || OpcodeName.startswith("ADJCALLSTACK")) { return llvm::make_error( @@ -38,8 +37,7 @@ template class X86SnippetGenerator : public Impl { // Handle X87. const unsigned FPInstClass = - InstrInfo.get(Opcode).TSFlags & llvm::X86II::FPTypeMask; - const Instruction Instr(this->State, Opcode); + Instr.Description->TSFlags & llvm::X86II::FPTypeMask; switch (FPInstClass) { case llvm::X86II::NotFP: break; @@ -67,7 +65,7 @@ template class X86SnippetGenerator : public Impl { } // Fallback to generic implementation. - return Impl::Base::generateCodeTemplate(Opcode); + return Impl::Base::generateCodeTemplate(Instr); } }; diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp index 8fed1375c6f..b4891f1f1db 100644 --- a/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/tools/llvm-exegesis/llvm-exegesis.cpp @@ -124,12 +124,8 @@ static unsigned getOpcodeOrDie(const llvm::MCInstrInfo &MCInstrInfo) { // Generates code snippets for opcode `Opcode`. static llvm::Expected> generateSnippets(const LLVMState &State, unsigned Opcode) { - const std::unique_ptr Generator = - State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State); - if (!Generator) - llvm::report_fatal_error("cannot create snippet generator"); - - const llvm::MCInstrDesc &InstrDesc = State.getInstrInfo().get(Opcode); + const Instruction Instr(State, Opcode); + const llvm::MCInstrDesc &InstrDesc = *Instr.Description; // Ignore instructions that we cannot run. 
if (InstrDesc.isPseudo()) return llvm::make_error("Unsupported opcode: isPseudo"); @@ -140,7 +136,11 @@ generateSnippets(const LLVMState &State, unsigned Opcode) { return llvm::make_error( "Unsupported opcode: isCall/isReturn"); - return Generator->generateConfigurations(Opcode); + const std::unique_ptr Generator = + State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State); + if (!Generator) + llvm::report_fatal_error("cannot create snippet generator"); + return Generator->generateConfigurations(Instr); } namespace { diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp index f2539aaea18..4df489df06f 100644 --- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp +++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp @@ -59,7 +59,8 @@ protected: CodeTemplate checkAndGetCodeTemplate(unsigned Opcode) { randomGenerator().seed(0); // Initialize seed. - auto CodeTemplateOrError = Generator.generateCodeTemplate(Opcode); + const Instruction Instr(State, Opcode); + auto CodeTemplateOrError = Generator.generateCodeTemplate(Instr); EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration. return std::move(CodeTemplateOrError.get()); } @@ -238,7 +239,8 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse) { TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) { // MOVSB writes to scratch memory register. 
const unsigned Opcode = llvm::X86::MOVSB; - auto Error = Generator.generateCodeTemplate(Opcode).takeError(); + const Instruction Instr(State, Opcode); + auto Error = Generator.generateCodeTemplate(Instr).takeError(); EXPECT_TRUE((bool)Error); llvm::consumeError(std::move(Error)); } @@ -253,7 +255,7 @@ public: private: llvm::Expected - generateCodeTemplate(unsigned Opcode) const override { + generateCodeTemplate(const Instruction &Instr) const override { return llvm::make_error("not implemented", llvm::inconvertibleErrorCode()); } -- GitLab From d93bcaaa5bc5c40187a582a78c39d240db9cebbc Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Wed, 10 Oct 2018 16:08:02 +0000 Subject: [PATCH 0006/1116] [llvm-mca] Minor refactoring in preparation for a patch that will fully fix PR36671. NFCI git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344149 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../include/HardwareUnits/RegisterFile.h | 11 ++++++----- .../llvm-mca/lib/HardwareUnits/RegisterFile.cpp | 16 +++++++++------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h index 1026079c377..6a45c707de0 100644 --- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h +++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h @@ -68,9 +68,11 @@ class RegisterFile : public HardwareUnit { bool AllowZeroMoveEliminationOnly; RegisterMappingTracker(unsigned NumPhysRegisters, - unsigned MaxMoveEliminated = 0U) + unsigned MaxMoveEliminated = 0U, + bool AllowZeroMoveElimOnly = false) : NumPhysRegs(NumPhysRegisters), NumUsedPhysRegs(0), - MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U) {} + MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U), + AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly) {} }; // A vector of register file descriptors. 
This set always contains at least @@ -151,9 +153,8 @@ class RegisterFile : public HardwareUnit { // Here FPRegisterFile contains all the registers defined by register class // VR128RegClass and VR256RegClass. FPRegisterFile implements 60 // registers which can be used for register renaming purpose. - void - addRegisterFile(llvm::ArrayRef RegisterClasses, - unsigned NumPhysRegs); + void addRegisterFile(const llvm::MCRegisterFileDesc &RF, + llvm::ArrayRef Entries); // Consumes physical registers in each register file specified by the // `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`. diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp index 51a24786139..01131253b5b 100644 --- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -37,7 +37,7 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) { // declared by the target. The number of physical registers in the default // register file is set equal to `NumRegs`. A value of zero for `NumRegs` // means: this register file has an unbounded number of physical registers. - addRegisterFile({} /* all registers */, NumRegs); + RegisterFiles.emplace_back(NumRegs); if (!SM.hasExtraProcessorInfo()) return; @@ -48,15 +48,17 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) { for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) { const MCRegisterFileDesc &RF = Info.RegisterFiles[I]; // Skip invalid register files with zero physical registers. - unsigned Length = RF.NumRegisterCostEntries; + // TODO: verify this constraint in SubtargetEmitter, and convert this + // statement into an assert. if (!RF.NumPhysRegs) continue; + // The cost of a register definition is equivalent to the number of // physical registers that are allocated at register renaming stage. 
+ unsigned Length = RF.NumRegisterCostEntries; const MCRegisterCostEntry *FirstElt = &Info.RegisterCostTable[RF.RegisterCostEntryIdx]; - addRegisterFile(ArrayRef(FirstElt, Length), - RF.NumPhysRegs); + addRegisterFile(RF, ArrayRef(FirstElt, Length)); } } @@ -65,15 +67,15 @@ void RegisterFile::cycleStart() { RMT.NumMoveEliminated = 0; } -void RegisterFile::addRegisterFile(ArrayRef Entries, - unsigned NumPhysRegs) { +void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF, + ArrayRef Entries) { // A default register file is always allocated at index #0. That register file // is mainly used to count the total number of mappings created by all // register files at runtime. Users can limit the number of available physical // registers in register file #0 through the command line flag // `-register-file-size`. unsigned RegisterFileIndex = RegisterFiles.size(); - RegisterFiles.emplace_back(NumPhysRegs); + RegisterFiles.emplace_back(RF.NumPhysRegs); // Special case where there is no register class identifier in the set. // An empty set of register classes means: this register file contains all -- GitLab From 0768811666e1fa7b814858ed657b2b2e0055a8f7 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Wed, 10 Oct 2018 16:16:43 +0000 Subject: [PATCH 0007/1116] [llvm-exegesis] Fix always true assert git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344151 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/MCInstrDescView.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h index 265476ae125..6910538a31f 100644 --- a/tools/llvm-exegesis/lib/MCInstrDescView.h +++ b/tools/llvm-exegesis/lib/MCInstrDescView.h @@ -81,7 +81,7 @@ struct Operand { const llvm::MCOperandInfo &getExplicitOperandInfo() const; // Please use the accessors above and not the following fields. 
- unsigned Index = 0; + int Index = -1; bool IsDef = false; const RegisterAliasingTracker *Tracker = nullptr; // Set for Register Op. const llvm::MCOperandInfo *Info = nullptr; // Set for Explicit Op. -- GitLab From 7b3bebb197858be97876e0eb8b98b2f0be71387b Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Wed, 10 Oct 2018 16:35:47 +0000 Subject: [PATCH 0008/1116] Relax trivial cast requirements in CallPromotionUtils Differential Revision: https://reviews.llvm.org/D52792 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344153 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/CallPromotionUtils.cpp | 14 +++--- .../Util/call-promotion-utils-ptrcast.ll | 50 +++++++++++++++++++ 2 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 test/Transforms/Util/call-promotion-utils-ptrcast.ll diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index 6d18d061461..261ab87c3e7 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -177,8 +177,8 @@ static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) { InsertBefore = &*std::next(CS.getInstruction()->getIterator()); // Bitcast the return value to the correct type. - auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(), - RetTy, "", InsertBefore); + auto *Cast = CastInst::CreateBitOrPointerCast(CS.getInstruction(), RetTy, "", + InsertBefore); if (RetBitCast) *RetBitCast = Cast; @@ -321,12 +321,14 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee, const char **FailureReason) { assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted"); + auto &DL = Callee->getParent()->getDataLayout(); + // Check the return type. The callee's return value type must be bitcast // compatible with the call site's type. 
Type *CallRetTy = CS.getInstruction()->getType(); Type *FuncRetTy = Callee->getReturnType(); if (CallRetTy != FuncRetTy) - if (!CastInst::isBitCastable(FuncRetTy, CallRetTy)) { + if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) { if (FailureReason) *FailureReason = "Return type mismatch"; return false; @@ -351,7 +353,7 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee, Type *ActualTy = CS.getArgument(I)->getType(); if (FormalTy == ActualTy) continue; - if (!CastInst::isBitCastable(ActualTy, FormalTy)) { + if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) { if (FailureReason) *FailureReason = "Argument type mismatch"; return false; @@ -396,8 +398,8 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee, Type *FormalTy = CalleeType->getParamType(ArgNo); Type *ActualTy = Arg->getType(); if (FormalTy != ActualTy) { - auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "", - CS.getInstruction()); + auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", + CS.getInstruction()); CS.setArgument(ArgNo, Cast); } } diff --git a/test/Transforms/Util/call-promotion-utils-ptrcast.ll b/test/Transforms/Util/call-promotion-utils-ptrcast.ll new file mode 100644 index 00000000000..351ec292f18 --- /dev/null +++ b/test/Transforms/Util/call-promotion-utils-ptrcast.ll @@ -0,0 +1,50 @@ +; RUN: opt -S -pgo-icall-prom -icp-total-percent-threshold=0 -icp-max-prom=4 < %s 2>&1 | FileCheck %s + +; Test that CallPromotionUtils will promote calls which require pointer casts. + +@foo = common global i64 (i64)* null, align 8 + +; Check ptrcast arguments. +define i64 @func1(i8* %a) { + ret i64 undef +} + +; Check ptrcast return. +define i8* @func2(i64 %a) { + ret i8* undef +} + +; Check ptrcast arguments and return. +define i8* @func3(i8 *%a) { + ret i8* undef +} + +; Check mixed ptrcast and bitcast. 
+define i8* @func4(double %f) { + ret i8* undef +} + +define i64 @bar() { + %tmp = load i64 (i64)*, i64 (i64)** @foo, align 8 + +; CHECK: [[ARG:%[0-9]+]] = bitcast i64 1 to double +; CHECK-NEXT: [[RET:%[0-9]+]] = call i8* @func4(double [[ARG]]) +; CHECK-NEXT: ptrtoint i8* [[RET]] to i64 + +; CHECK: [[RET:%[0-9]+]] = call i8* @func2(i64 1) +; CHECK-NEXT: ptrtoint i8* [[RET]] to i64 + +; CHECK: [[ARG:%[0-9]+]] = inttoptr i64 1 to i8* +; CHECK-NEXT: [[RET:%[0-9]+]] = call i8* @func3(i8* [[ARG]]) +; CHECK-NEXT: ptrtoint i8* [[RET]] to i64 + +; CHECK: [[ARG:%[0-9]+]] = inttoptr i64 1 to i8* +; CHECK-NEXT: call i64 @func1(i8* [[ARG]]) +; CHECK-NOT: ptrtoint +; CHECK-NOT: bitcast + + %call = call i64 %tmp(i64 1), !prof !1 + ret i64 %call +} + +!1 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410, i64 -6929281286627296573, i64 150, i64 -2545542355363006406, i64 10} -- GitLab From a50609a0fa137e401a255fcbe050e1eeaf7a77a6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 10 Oct 2018 17:37:32 +0000 Subject: [PATCH 0009/1116] Change the timestamp of llvmcache-foo file to meet the thinLTO prune policy The test case fails randomly when it is run in a loop with the command "while llvm-lit test/tools/gold/X86/cache.ll ; do true; done". This is because the llvmcache-foo file is younger than llvmcache-349F039B8EB076D412007D82778442BED3148C4E and llvmcache-A8107945C65C2B2BBEE8E61AA604C311D60D58D6, but, because of limited timestamp precision, their timestamps are usually the same. Given equal timestamps, the file prune policy removes the bigger file first, so the foo file is usually removed first because it is bigger. The total file size is then under the threshold after deleting the foo file, which is what the test case expects.
However, sometimes the precision is enough to show that the timestamps of llvmcache-349F039B8EB076D412007D82778442BED3148C4E and llvmcache-A8107945C65C2B2BBEE8E61AA604C311D60D58D6 are earlier than that of llvmcache-foo, so llvmcache-349F039B8EB076D412007D82778442BED3148C4E and llvmcache-A8107945C65C2B2BBEE8E61AA604C311D60D58D6 are deleted first. Since the total file size is still above the file size threshold after deleting those 2 files, the foo file is also deleted. The test case then fails, because it expects only one file to be deleted instead of 3. The fix is to change the timestamp of the llvmcache-foo file to meet the thinLTO prune policy. Patch by Luo Yuanke. Differential Revision: https://reviews.llvm.org/D52452 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344158 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/gold/X86/cache.ll | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/tools/gold/X86/cache.ll b/test/tools/gold/X86/cache.ll index 51ffee282b1..4446aa6d887 100644 --- a/test/tools/gold/X86/cache.ll +++ b/test/tools/gold/X86/cache.ll @@ -53,6 +53,9 @@ ; RUN: ls %t.cache | count 5 +; Increase the age of llvmcache-foo +; RUN: touch -r %t.cache/llvmcache-foo -d '-2 minutes' %t.cache/llvmcache-foo + ; This should remove it. ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \ ; RUN: --plugin-opt=thinlto \ -- GitLab From 3b607cb1fc8c262c2e8496c44814325a8cd0729a Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Wed, 10 Oct 2018 17:55:21 +0000 Subject: [PATCH 0010/1116] [VPlan] Fix CondBit quoting in dumpBasicBlock Quotes were being printed for VPInstructions but not the rest.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344161 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/VPlan.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp index 09d20fbdefe..39cb4e9ec68 100644 --- a/lib/Transforms/Vectorize/VPlan.cpp +++ b/lib/Transforms/Vectorize/VPlan.cpp @@ -543,8 +543,10 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { if (const VPInstruction *CBI = dyn_cast(CBV)) { CBI->printAsOperand(OS); OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; - } else + } else { CBV->printAsOperand(OS); + OS << "\""; + } } bumpIndent(-2); -- GitLab From 1cc98e6672b6319fdb00b70dd4474aabdadbe193 Mon Sep 17 00:00:00 2001 From: Francis Visoiu Mistrih Date: Wed, 10 Oct 2018 17:58:09 +0000 Subject: [PATCH 0011/1116] [OptRemarks] Add library for parsing optimization remarks Add a library that parses optimization remarks (currently YAML, so based on the YAMLParser). The goal is to be able to provide tools a remark parser that is not completely dependent on YAML, in case we decide to change the format later. It exposes a C API which takes a handler that is called with the remark structure. It adds a libLLVMOptRemark.a static library, and it's used in-tree by the llvm-opt-report tool (from which the parser has been mostly moved out). 
Differential Revision: https://reviews.llvm.org/D52776 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344162 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/OptRemarks.h | 197 ++++++++ lib/CMakeLists.txt | 1 + lib/LLVMBuild.txt | 1 + lib/OptRemarks/CMakeLists.txt | 3 + lib/OptRemarks/LLVMBuild.txt | 22 + lib/OptRemarks/OptRemarksParser.cpp | 368 +++++++++++++++ tools/llvm-opt-report/CMakeLists.txt | 2 +- tools/llvm-opt-report/OptReport.cpp | 144 ++---- unittests/CMakeLists.txt | 1 + unittests/OptRemarks/CMakeLists.txt | 8 + .../OptRemarks/OptRemarksParsingTest.cpp | 433 ++++++++++++++++++ 11 files changed, 1073 insertions(+), 107 deletions(-) create mode 100644 include/llvm-c/OptRemarks.h create mode 100644 lib/OptRemarks/CMakeLists.txt create mode 100644 lib/OptRemarks/LLVMBuild.txt create mode 100644 lib/OptRemarks/OptRemarksParser.cpp create mode 100644 unittests/OptRemarks/CMakeLists.txt create mode 100644 unittests/OptRemarks/OptRemarksParsingTest.cpp diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h new file mode 100644 index 00000000000..f3449cc1b8c --- /dev/null +++ b/include/llvm-c/OptRemarks.h @@ -0,0 +1,197 @@ +/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\ +|* *| +|* The LLVM Compiler Infrastructure *| +|* *| +|* This file is distributed under the University of Illinois Open Source *| +|* License. See LICENSE.TXT for details. *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header provides a public interface to an opt-remark library. *| +|* LLVM provides an implementation of this interface. 
*| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_OPT_REMARKS_H +#define LLVM_C_OPT_REMARKS_H + +#include "llvm-c/Core.h" +#include "llvm-c/Types.h" +#ifdef __cplusplus +#include +extern "C" { +#else +#include +#endif /* !defined(__cplusplus) */ + +/** + * @defgroup LLVMCOPTREMARKS OptRemarks + * @ingroup LLVMC + * + * @{ + */ + +#define OPT_REMARKS_API_VERSION 0 + +/** + * String containing a buffer and a length. The buffer is not guaranteed to be + * zero-terminated. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + const char *Str; + uint32_t Len; +} LLVMOptRemarkStringRef; + +/** + * DebugLoc containing File, Line and Column. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + // File: + LLVMOptRemarkStringRef SourceFile; + // Line: + uint32_t SourceLineNumber; + // Column: + uint32_t SourceColumnNumber; +} LLVMOptRemarkDebugLoc; + +/** + * Element of the "Args" list. The key might give more information about what + * are the semantics of the value, e.g. "Callee" will tell you that the value + * is a symbol that names a function. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + // e.g. "Callee" + LLVMOptRemarkStringRef Key; + // e.g. "malloc" + LLVMOptRemarkStringRef Value; + + // "DebugLoc": Optional + LLVMOptRemarkDebugLoc DebugLoc; +} LLVMOptRemarkArg; + +/** + * One remark entry. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + // e.g. !Missed, !Passed + LLVMOptRemarkStringRef RemarkType; + // "Pass": Required + LLVMOptRemarkStringRef PassName; + // "Name": Required + LLVMOptRemarkStringRef RemarkName; + // "Function": Required + LLVMOptRemarkStringRef FunctionName; + + // "DebugLoc": Optional + LLVMOptRemarkDebugLoc DebugLoc; + // "Hotness": Optional + uint32_t Hotness; + // "Args": Optional. It is an array of `num_args` elements. 
+ uint32_t NumArgs; + LLVMOptRemarkArg *Args; +} LLVMOptRemarkEntry; + +typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef; + +/** + * Creates a remark parser that can be used to read and parse the buffer located + * in \p Buf of size \p Size. + * + * \p Buf cannot be NULL. + * + * This function should be paired with LLVMOptRemarkParserDispose() to avoid + * leaking resources. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf, + uint64_t Size); + +/** + * Returns the next remark in the file. + * + * The value pointed to by the return value is invalidated by the next call to + * LLVMOptRemarkParserGetNext(). + * + * If the parser reaches the end of the buffer, the return value will be NULL. + * + * In the case of an error, the return value will be NULL, and: + * + * 1) LLVMOptRemarkParserHasError() will return `1`. + * + * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error + * message. + * + * An error may occur if: + * + * 1) An argument is invalid. + * + * 2) There is a YAML parsing error. This type of error aborts parsing + * immediately and returns `1`. It can occur on malformed YAML. + * + * 3) Remark parsing error. If this type of error occurs, the parser won't call + * the handler and will continue to the next one. It can occur on malformed + * remarks, like missing or extra fields in the file. 
+ * + * Here is a quick example of the usage: + * + * ``` + * LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size); + * LLVMOptRemarkEntry *Remark = NULL; + * while ((Remark = LLVMOptRemarkParserGetNext(Parser))) { + * // use Remark + * } + * bool HasError = LLVMOptRemarkParserHasError(Parser); + * LLVMOptRemarkParserDispose(Parser); + * ``` + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern LLVMOptRemarkEntry * +LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser); + +/** + * Returns `1` if the parser encountered an error while parsing the buffer. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser); + +/** + * Returns a null-terminated string containing an error message. + * + * In case of no error, the result is `NULL`. + * + * The memory of the string is bound to the lifetime of \p Parser. If + * LLVMOptRemarkParserDispose() is called, the memory of the string will be + * released. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern const char * +LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser); + +/** + * Releases all the resources used by \p Parser. 
+ * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser); + +/** + * @} // endgroup LLVMCOPTREMARKS + */ + +#ifdef __cplusplus +} +#endif /* !defined(__cplusplus) */ + +#endif /* LLVM_C_OPT_REMARKS_H */ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index ecf8b93d253..1f54c611bad 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -15,6 +15,7 @@ add_subdirectory(MC) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) +add_subdirectory(OptRemarks) add_subdirectory(DebugInfo) add_subdirectory(ExecutionEngine) add_subdirectory(Target) diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt index a6cd15699fb..0eb4bba2676 100644 --- a/lib/LLVMBuild.txt +++ b/lib/LLVMBuild.txt @@ -35,6 +35,7 @@ subdirectories = BinaryFormat ObjectYAML Option + OptRemarks Passes ProfileData Support diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt new file mode 100644 index 00000000000..8fefe1d986b --- /dev/null +++ b/lib/OptRemarks/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMOptRemarks + OptRemarksParser.cpp +) diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt new file mode 100644 index 00000000000..4c1032296dc --- /dev/null +++ b/lib/OptRemarks/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = OptRemarks +parent = Libraries +required_libraries = Support diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp new file mode 100644 index 00000000000..4b8b038c832 --- /dev/null +++ b/lib/OptRemarks/OptRemarksParser.cpp @@ -0,0 +1,368 @@ +//===- OptRemarksParser.cpp -----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides utility methods used by clients that want to use the +// parser for optimization remarks in LLVM. +// +//===----------------------------------------------------------------------===// + +#include "llvm-c/OptRemarks.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/YAMLTraits.h" + +using namespace llvm; + +namespace { +struct RemarkParser { + /// Source manager for better error messages. + SourceMgr SM; + /// Stream for yaml parsing. + yaml::Stream Stream; + /// Storage for the error stream. + std::string ErrorString; + /// The error stream. + raw_string_ostream ErrorStream; + /// Iterator in the YAML stream. + yaml::document_iterator DI; + /// The parsed remark (if any). + Optional LastRemark; + /// Temporary parsing buffer for the arguments. + SmallVector TmpArgs; + /// The state used by the parser to parse a remark entry. Invalidated with + /// every call to `parseYAMLElement`. + struct ParseState { + /// Temporary parsing buffer for the arguments. 
+ SmallVectorImpl *Args; + StringRef Type; + StringRef Pass; + StringRef Name; + StringRef Function; + /// Optional. + Optional File; + Optional Line; + Optional Column; + Optional Hotness; + + ParseState(SmallVectorImpl &Args) : Args(&Args) {} + /// Use Args only as a **temporary** buffer. + ~ParseState() { Args->clear(); } + }; + + ParseState State; + + /// Set to `true` if we had any errors during parsing. + bool HadAnyErrors = false; + + RemarkParser(StringRef Buf) + : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString), + DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) { + SM.setDiagHandler(RemarkParser::HandleDiagnostic, this); + } + + /// Parse a YAML element. + Error parseYAMLElement(yaml::Document &Remark); + +private: + /// Parse one key to a string. + /// otherwise. + Error parseKey(StringRef &Result, yaml::KeyValueNode &Node); + /// Parse one value to a string. + Error parseValue(StringRef &Result, yaml::KeyValueNode &Node); + /// Parse one value to an unsigned. + Error parseValue(Optional &Result, yaml::KeyValueNode &Node); + /// Parse a debug location. + Error parseDebugLoc(Optional &File, Optional &Line, + Optional &Column, yaml::KeyValueNode &Node); + /// Parse an argument. + Error parseArg(SmallVectorImpl &TmpArgs, yaml::Node &Node); + + /// Handle a diagnostic from the YAML stream. Records the error in the + /// RemarkParser class. 
+ static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) { + assert(Ctx && "Expected non-null Ctx in diagnostic handler."); + auto *Parser = static_cast(Ctx); + Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false, + /*ShowKindLabels*/ true); + } +}; + +class ParseError : public ErrorInfo { +public: + static char ID; + + ParseError(StringRef Message, yaml::Node &Node) + : Message(Message), Node(Node) {} + + void log(raw_ostream &OS) const override { OS << Message; } + std::error_code convertToErrorCode() const override { + return inconvertibleErrorCode(); + } + + StringRef getMessage() const { return Message; } + yaml::Node &getNode() const { return Node; } + +private: + StringRef Message; // No need to hold a full copy of the buffer. + yaml::Node &Node; +}; + +char ParseError::ID = 0; + +static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) { + return {Str.data(), static_cast(Str.size())}; +} + +Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) { + auto *Key = dyn_cast(Node.getKey()); + if (!Key) + return make_error("key is not a string.", Node); + + Result = Key->getRawValue(); + return Error::success(); +} + +Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) { + auto *Value = dyn_cast(Node.getValue()); + if (!Value) + return make_error("expected a value of scalar type.", Node); + Result = Value->getRawValue(); + + if (Result.front() == '\'') + Result = Result.drop_front(); + + if (Result.back() == '\'') + Result = Result.drop_back(); + + return Error::success(); +} + +Error RemarkParser::parseValue(Optional &Result, + yaml::KeyValueNode &Node) { + SmallVector Tmp; + auto *Value = dyn_cast(Node.getValue()); + if (!Value) + return make_error("expected a value of scalar type.", Node); + unsigned UnsignedValue = 0; + if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue)) + return make_error("expected a value of integer type.", *Value); + Result = UnsignedValue; + return 
Error::success(); +} + +Error RemarkParser::parseDebugLoc(Optional &File, + Optional &Line, + Optional &Column, + yaml::KeyValueNode &Node) { + auto *DebugLoc = dyn_cast(Node.getValue()); + if (!DebugLoc) + return make_error("expected a value of mapping type.", Node); + + for (yaml::KeyValueNode &DLNode : *DebugLoc) { + StringRef KeyName; + if (Error E = parseKey(KeyName, DLNode)) + return E; + if (KeyName == "File") { + File = StringRef(); // Set the optional to contain a default constructed + // value, to be passed to the parsing function. + if (Error E = parseValue(*File, DLNode)) + return E; + } else if (KeyName == "Column") { + if (Error E = parseValue(Column, DLNode)) + return E; + } else if (KeyName == "Line") { + if (Error E = parseValue(Line, DLNode)) + return E; + } else { + return make_error("unknown entry in DebugLoc map.", DLNode); + } + } + + // If any of the debug loc fields is missing, return an error. + if (!File || !Line || !Column) + return make_error("DebugLoc node incomplete.", Node); + + return Error::success(); +} + +Error RemarkParser::parseArg(SmallVectorImpl &Args, + yaml::Node &Node) { + auto *ArgMap = dyn_cast(&Node); + if (!ArgMap) + return make_error("expected a value of mapping type.", Node); + + StringRef ValueStr; + StringRef KeyStr; + Optional File; + Optional Line; + Optional Column; + + for (yaml::KeyValueNode &ArgEntry : *ArgMap) { + StringRef KeyName; + if (Error E = parseKey(KeyName, ArgEntry)) + return E; + + // Try to parse debug locs. + if (KeyName == "DebugLoc") { + // Can't have multiple DebugLoc entries per argument. + if (File || Line || Column) + return make_error( + "only one DebugLoc entry is allowed per argument.", ArgEntry); + + if (Error E = parseDebugLoc(File, Line, Column, ArgEntry)) + return E; + continue; + } + + // If we already have a string, error out. + if (!ValueStr.empty()) + return make_error( + "only one string entry is allowed per argument.", ArgEntry); + + // Try to parse a string. 
+ if (Error E = parseValue(ValueStr, ArgEntry)) + return E; + + // Keep the key from the string. + KeyStr = KeyName; + } + + if (KeyStr.empty()) + return make_error("argument key is missing.", *ArgMap); + if (ValueStr.empty()) + return make_error("argument value is missing.", *ArgMap); + + Args.push_back(LLVMOptRemarkArg{ + toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr), + LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())), + Line.getValueOr(0), Column.getValueOr(0)}}); + + return Error::success(); +} + +Error RemarkParser::parseYAMLElement(yaml::Document &Remark) { + // Parsing a new remark, clear the previous one. + LastRemark = None; + State = ParseState(TmpArgs); + + auto *Root = dyn_cast(Remark.getRoot()); + if (!Root) + return make_error("document root is not of mapping type.", + *Remark.getRoot()); + + State.Type = Root->getRawTag(); + + for (yaml::KeyValueNode &RemarkField : *Root) { + StringRef KeyName; + if (Error E = parseKey(KeyName, RemarkField)) + return E; + + if (KeyName == "Pass") { + if (Error E = parseValue(State.Pass, RemarkField)) + return E; + } else if (KeyName == "Name") { + if (Error E = parseValue(State.Name, RemarkField)) + return E; + } else if (KeyName == "Function") { + if (Error E = parseValue(State.Function, RemarkField)) + return E; + } else if (KeyName == "Hotness") { + if (Error E = parseValue(State.Hotness, RemarkField)) + return E; + } else if (KeyName == "DebugLoc") { + if (Error E = + parseDebugLoc(State.File, State.Line, State.Column, RemarkField)) + return E; + } else if (KeyName == "Args") { + auto *Args = dyn_cast(RemarkField.getValue()); + if (!Args) + return make_error("wrong value type for key.", RemarkField); + + for (yaml::Node &Arg : *Args) + if (Error E = parseArg(*State.Args, Arg)) + return E; + } else { + return make_error("unknown key.", RemarkField); + } + } + + // If the YAML parsing failed, don't even continue parsing. We might + // encounter malformed YAML. 
+ if (Stream.failed()) + return make_error("YAML parsing failed.", *Remark.getRoot()); + + // Check if any of the mandatory fields are missing. + if (State.Type.empty() || State.Pass.empty() || State.Name.empty() || + State.Function.empty()) + return make_error("Type, Pass, Name or Function missing.", + *Remark.getRoot()); + + LastRemark = LLVMOptRemarkEntry{ + toOptRemarkStr(State.Type), + toOptRemarkStr(State.Pass), + toOptRemarkStr(State.Name), + toOptRemarkStr(State.Function), + LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())), + State.Line.getValueOr(0), + State.Column.getValueOr(0)}, + State.Hotness.getValueOr(0), + static_cast(State.Args->size()), + State.Args->data()}; + + return Error::success(); +} +} // namespace + +// Create wrappers for C Binding types (see CBindingWrapping.h). +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef); + +extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf, + uint64_t Size) { + return wrap( + new RemarkParser(StringRef(static_cast(Buf), Size))); +} + +extern "C" LLVMOptRemarkEntry * +LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) { + RemarkParser &TheParser = *unwrap(Parser); + // Check for EOF. + if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end()) + return nullptr; + + // Try to parse an entry. + if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) { + handleAllErrors(std::move(E), [&](const ParseError &PE) { + TheParser.Stream.printError(&PE.getNode(), + Twine(PE.getMessage()) + Twine('\n')); + TheParser.HadAnyErrors = true; + }); + return nullptr; + } + + // Move on. + ++TheParser.DI; + + // Return the just-parsed remark. 
+ if (Optional &Entry = TheParser.LastRemark) + return &*Entry; + return nullptr; +} + +extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) { + return unwrap(Parser)->HadAnyErrors; +} + +extern "C" const char * +LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) { + return unwrap(Parser)->ErrorStream.str().c_str(); +} + +extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) { + delete unwrap(Parser); +} diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt index 777537a54c0..3aabc03ab3f 100644 --- a/tools/llvm-opt-report/CMakeLists.txt +++ b/tools/llvm-opt-report/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS Core Demangle Object Support) +set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support) add_llvm_tool(llvm-opt-report OptReport.cpp diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp index aa7966132c2..071f779a9e6 100644 --- a/tools/llvm-opt-report/OptReport.cpp +++ b/tools/llvm-opt-report/OptReport.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/WithColor.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm-c/OptRemarks.h" #include #include #include @@ -142,104 +143,44 @@ typedef std::map>>> LocationInfoTy; } // anonymous namespace -static void collectLocationInfo(yaml::Stream &Stream, - LocationInfoTy &LocationInfo) { - SmallVector Tmp; - - // Note: We're using the YAML parser here directly, instead of using the - // YAMLTraits implementation, because the YAMLTraits implementation does not - // support a way to handle only a subset of the input keys (it will error out - // if there is an input key that you don't map to your class), and - // furthermore, it does not provide a way to handle the Args sequence of - // key/value pairs, where the order must be captured and the 'String' key - // might be repeated. 
- for (auto &Doc : Stream) { - auto *Root = dyn_cast(Doc.getRoot()); - if (!Root) - continue; +static bool readLocationInfo(LocationInfoTy &LocationInfo) { + ErrorOr> Buf = + MemoryBuffer::getFile(InputFileName.c_str()); + if (std::error_code EC = Buf.getError()) { + WithColor::error() << "Can't open file " << InputFileName << ": " + << EC.message() << "\n"; + return false; + } - bool Transformed = Root->getRawTag() == "!Passed"; - std::string Pass, File, Function; - int Line = 0, Column = 1; + StringRef Buffer = (*Buf)->getBuffer(); + LLVMOptRemarkParserRef Parser = + LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size()); + + LLVMOptRemarkEntry *Remark = nullptr; + while ((Remark = LLVMOptRemarkParserGetNext(Parser))) { + bool Transformed = + StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed"; + StringRef Pass(Remark->PassName.Str, Remark->PassName.Len); + StringRef File(Remark->DebugLoc.SourceFile.Str, + Remark->DebugLoc.SourceFile.Len); + StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len); + uint32_t Line = Remark->DebugLoc.SourceLineNumber; + uint32_t Column = Remark->DebugLoc.SourceColumnNumber; + ArrayRef Args(Remark->Args, Remark->NumArgs); int VectorizationFactor = 1; int InterleaveCount = 1; int UnrollCount = 1; - for (auto &RootChild : *Root) { - auto *Key = dyn_cast(RootChild.getKey()); - if (!Key) - continue; - StringRef KeyName = Key->getValue(Tmp); - if (KeyName == "Pass") { - auto *Value = dyn_cast(RootChild.getValue()); - if (!Value) - continue; - Pass = Value->getValue(Tmp); - } else if (KeyName == "Function") { - auto *Value = dyn_cast(RootChild.getValue()); - if (!Value) - continue; - Function = Value->getValue(Tmp); - } else if (KeyName == "DebugLoc") { - auto *DebugLoc = dyn_cast(RootChild.getValue()); - if (!DebugLoc) - continue; - - for (auto &DLChild : *DebugLoc) { - auto *DLKey = dyn_cast(DLChild.getKey()); - if (!DLKey) - continue; - StringRef DLKeyName = DLKey->getValue(Tmp); - if (DLKeyName == 
"File") { - auto *Value = dyn_cast(DLChild.getValue()); - if (!Value) - continue; - File = Value->getValue(Tmp); - } else if (DLKeyName == "Line") { - auto *Value = dyn_cast(DLChild.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, Line); - } else if (DLKeyName == "Column") { - auto *Value = dyn_cast(DLChild.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, Column); - } - } - } else if (KeyName == "Args") { - auto *Args = dyn_cast(RootChild.getValue()); - if (!Args) - continue; - for (auto &ArgChild : *Args) { - auto *ArgMap = dyn_cast(&ArgChild); - if (!ArgMap) - continue; - for (auto &ArgKV : *ArgMap) { - auto *ArgKey = dyn_cast(ArgKV.getKey()); - if (!ArgKey) - continue; - StringRef ArgKeyName = ArgKey->getValue(Tmp); - if (ArgKeyName == "VectorizationFactor") { - auto *Value = dyn_cast(ArgKV.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, VectorizationFactor); - } else if (ArgKeyName == "InterleaveCount") { - auto *Value = dyn_cast(ArgKV.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, InterleaveCount); - } else if (ArgKeyName == "UnrollCount") { - auto *Value = dyn_cast(ArgKV.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, UnrollCount); - } - } - } - } + for (const LLVMOptRemarkArg &Arg : Args) { + StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len); + StringRef ArgValue(Arg.Value.Str, Arg.Value.Len); + if (ArgKeyName == "VectorizationFactor") + ArgValue.getAsInteger(10, VectorizationFactor); + else if (ArgKeyName == "InterleaveCount") + ArgValue.getAsInteger(10, InterleaveCount); + else if (ArgKeyName == "UnrollCount") + ArgValue.getAsInteger(10, UnrollCount); } if (Line < 1 || File.empty()) @@ -268,22 +209,13 @@ static void collectLocationInfo(yaml::Stream &Stream, UpdateLLII(LI.Vectorized); } } -} - -static bool readLocationInfo(LocationInfoTy &LocationInfo) { - ErrorOr> Buf = - 
MemoryBuffer::getFileOrSTDIN(InputFileName); - if (std::error_code EC = Buf.getError()) { - WithColor::error() << "Can't open file " << InputFileName << ": " - << EC.message() << "\n"; - return false; - } - SourceMgr SM; - yaml::Stream Stream(Buf.get()->getBuffer(), SM); - collectLocationInfo(Stream, LocationInfo); + bool HasError = LLVMOptRemarkParserHasError(Parser); + if (HasError) + WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n"; - return true; + LLVMOptRemarkParserDispose(Parser); + return !HasError; } static bool writeReport(LocationInfoTy &LocationInfo) { diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index bc41ab66a23..5dba2de4a88 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -26,6 +26,7 @@ add_subdirectory(MI) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) +add_subdirectory(OptRemarks) add_subdirectory(Passes) add_subdirectory(ProfileData) add_subdirectory(Support) diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt new file mode 100644 index 00000000000..94c74867cc4 --- /dev/null +++ b/unittests/OptRemarks/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_LINK_COMPONENTS + OptRemarks + Support + ) + +add_llvm_unittest(OptRemarksTests + OptRemarksParsingTest.cpp + ) diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp new file mode 100644 index 00000000000..a28820ffb7f --- /dev/null +++ b/unittests/OptRemarks/OptRemarksParsingTest.cpp @@ -0,0 +1,433 @@ +//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm-c/OptRemarks.h" +#include "gtest/gtest.h" + +using namespace llvm; + +template bool tryParse(const char (&Buf)[N]) { + LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1); + LLVMOptRemarkEntry *Remark = nullptr; + while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) { + EXPECT_TRUE(Remark == nullptr); // Only one remark per test. + Remark = NewRemark; + } + EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test. + bool HasError = LLVMOptRemarkParserHasError(Parser); + LLVMOptRemarkParserDispose(Parser); + return !HasError; +} + +template +bool parseExpectError(const char (&Buf)[N], const char *Error) { + LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1); + LLVMOptRemarkEntry *Remark = nullptr; + while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) { + EXPECT_FALSE(NewRemark); + } + EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark. + EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser)); + bool MatchesError = + StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error); + LLVMOptRemarkParserDispose(Parser); + + return MatchesError; +} + +TEST(OptRemarks, OptRemarksParsingEmpty) { + StringRef Buf = R"YAML( +)YAML"; + LLVMOptRemarkParserRef Parser = + LLVMOptRemarkParserCreate(Buf.data(), Buf.size()); + LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser); + EXPECT_TRUE(NewRemark == nullptr); // No remark expected. 
+ EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser)); + EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)) + .contains("document root is not of mapping type.")); + LLVMOptRemarkParserDispose(Parser); +} + +TEST(OptRemarks, OptRemarksParsingGood) { + EXPECT_TRUE(tryParse(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +DebugLoc: { File: file.c, Line: 3, Column: 12 } +Function: foo +Args: + - Callee: bar + - String: ' will not be inlined into ' + - Caller: foo + DebugLoc: { File: file.c, Line: 2, Column: 0 } + - String: ' because its definition is unavailable' +)YAML")); + + // No debug loc should also pass. + EXPECT_TRUE(tryParse(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +Args: + - Callee: bar + - String: ' will not be inlined into ' + - Caller: foo + DebugLoc: { File: file.c, Line: 2, Column: 0 } + - String: ' because its definition is unavailable' +)YAML")); + + // No args is also ok. + EXPECT_TRUE(tryParse(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +DebugLoc: { File: file.c, Line: 3, Column: 12 } +Function: foo +)YAML")); + + // Different order. + EXPECT_TRUE(tryParse(R"YAML( +--- !Missed +DebugLoc: { Line: 3, Column: 12, File: file.c } +Function: foo +Name: NoDefinition +Args: + - Callee: bar + - String: ' will not be inlined into ' + - Caller: foo + DebugLoc: { File: file.c, Line: 2, Column: 0 } + - String: ' because its definition is unavailable' +Pass: inline +)YAML")); +} + +// Mandatory common part of a remark. +#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n" +// Test all the types. 
+TEST(OptRemarks, OptRemarksParsingTypes) { + // Type: Passed + EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK)); + // Type: Missed + EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK)); + // Type: Analysis + EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK)); + // Type: AnalysisFPCompute + EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK)); + // Type: AnalysisAliasing + EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK)); + // Type: Failure + EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK)); +} +#undef COMMON_REMARK + +TEST(OptRemarks, OptRemarksParsingMissingFields) { + // No type. + EXPECT_TRUE(parseExpectError(R"YAML( +--- +Pass: inline +Name: NoDefinition +Function: foo +)YAML", + "error: Type, Pass, Name or Function missing.")); + // No pass. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Name: NoDefinition +Function: foo +)YAML", + "error: Type, Pass, Name or Function missing.")); + // No name. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Function: foo +)YAML", + "error: Type, Pass, Name or Function missing.")); + // No function. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +)YAML", + "error: Type, Pass, Name or Function missing.")); + // Debug loc but no file. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { Line: 3, Column: 12 } +)YAML", + "DebugLoc node incomplete.")); + // Debug loc but no line. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Column: 12 } +)YAML", + "DebugLoc node incomplete.")); + // Debug loc but no column. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Line: 3 } +)YAML", + "DebugLoc node incomplete.")); +} + +TEST(OptRemarks, OptRemarksParsingWrongTypes) { + // Wrong debug loc type. 
+ EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: foo +)YAML", + "expected a value of mapping type.")); + // Wrong line type. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Line: b, Column: 12 } +)YAML", + "expected a value of integer type.")); + // Wrong column type. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Line: 3, Column: c } +)YAML", + "expected a value of integer type.")); + // Wrong args type. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +Args: foo +)YAML", + "wrong value type for key.")); + // Wrong key type. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +{ A: a }: inline +Name: NoDefinition +Function: foo +)YAML", + "key is not a string.")); + // Debug loc with unknown entry. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Column: 12, Unknown: 12 } +)YAML", + "unknown entry in DebugLoc map.")); + // Unknown entry. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Unknown: inline +)YAML", + "unknown key.")); + // Not a scalar. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: { File: a, Line: 1, Column: 2 } +Name: NoDefinition +Function: foo +)YAML", + "expected a value of scalar type.")); + // Not a string file in debug loc. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: { a: b }, Column: 12, Line: 12 } +)YAML", + "expected a value of scalar type.")); + // Not a integer column in debug loc. 
+ EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Column: { a: b }, Line: 12 } +)YAML", + "expected a value of scalar type.")); + // Not a integer line in debug loc. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Column: 12, Line: { a: b } } +)YAML", + "expected a value of scalar type.")); + // Not a mapping type value for args. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +DebugLoc: { File: file.c, Column: 12, Line: { a: b } } +)YAML", + "expected a value of scalar type.")); +} + +TEST(OptRemarks, OptRemarksParsingWrongArgs) { + // Multiple debug locs per arg. + EXPECT_TRUE( + parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +Args: + - Str: string + DebugLoc: { File: a, Line: 1, Column: 2 } + DebugLoc: { File: a, Line: 1, Column: 2 } +)YAML", + "only one DebugLoc entry is allowed per argument.")); + // Multiple strings per arg. + EXPECT_TRUE( + parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +Args: + - Str: string + Str2: string + DebugLoc: { File: a, Line: 1, Column: 2 } +)YAML", + "only one string entry is allowed per argument.")); + // No arg value. + EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +Args: + - Callee: '' + - DebugLoc: { File: a, Line: 1, Column: 2 } +)YAML", + "argument value is missing.")); + // No arg value. 
+ EXPECT_TRUE(parseExpectError(R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +Function: foo +Args: + - DebugLoc: { File: a, Line: 1, Column: 2 } +)YAML", + "argument key is missing.")); + +} + +TEST(OptRemarks, OptRemarksGoodStruct) { + StringRef Buf = R"YAML( +--- !Missed +Pass: inline +Name: NoDefinition +DebugLoc: { File: file.c, Line: 3, Column: 12 } +Function: foo +Args: + - Callee: bar + - String: ' will not be inlined into ' + - Caller: foo + DebugLoc: { File: file.c, Line: 2, Column: 0 } + - String: ' because its definition is unavailable' +)YAML"; + + LLVMOptRemarkParserRef Parser = + LLVMOptRemarkParserCreate(Buf.data(), Buf.size()); + LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser); + EXPECT_FALSE(Remark == nullptr); + EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed"); + EXPECT_EQ(Remark->RemarkType.Len, 7U); + EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline"); + EXPECT_EQ(Remark->PassName.Len, 6U); + EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition"); + EXPECT_EQ(Remark->RemarkName.Len, 12U); + EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo"); + EXPECT_EQ(Remark->FunctionName.Len, 3U); + EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c"); + EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U); + EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U); + EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U); + EXPECT_EQ(Remark->Hotness, 0U); + EXPECT_EQ(Remark->NumArgs, 4U); + // Arg 0 + { + LLVMOptRemarkArg &Arg = Remark->Args[0]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar"); + EXPECT_EQ(Arg.Value.Len, 3U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + // Arg 1 + { + LLVMOptRemarkArg &Arg = Remark->Args[1]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), 
"String"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into "); + EXPECT_EQ(Arg.Value.Len, 26U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + // Arg 2 + { + LLVMOptRemarkArg &Arg = Remark->Args[2]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo"); + EXPECT_EQ(Arg.Value.Len, 3U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c"); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + // Arg 3 + { + LLVMOptRemarkArg &Arg = Remark->Args[3]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 38), + " because its definition is unavailable"); + EXPECT_EQ(Arg.Value.Len, 38U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + + EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr); + + EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser)); + LLVMOptRemarkParserDispose(Parser); +} -- GitLab From f6b8b02db767ad21db1b66d25278c63a279554ed Mon Sep 17 00:00:00 2001 From: Volkan Keles Date: Wed, 10 Oct 2018 18:01:48 +0000 Subject: [PATCH 0012/1116] [GlobalISel] Fix the artifact combiner to fold G_IMPLICIT_DEF properly Summary: GlobalISel generates incorrect code because the legalizer artifact combiner assumes `G_[SZ]EXT (G_IMPLICIT_DEF)` is equivalent to `G_IMPLICIT_DEF `. Replace `G_[SZ]EXT (G_IMPLICIT_DEF)` with 0 because the top bits will be 0 for G_ZEXT and 0/1 for the G_SEXT. 
Reviewers: aditya_nandakumar, dsanders, aemerson, javed.absar Reviewed By: aditya_nandakumar Subscribers: rovka, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D52996 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344163 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../GlobalISel/LegalizationArtifactCombiner.h | 24 +++++++++---- .../AArch64/GlobalISel/legalize-ext.mir | 20 +++++++++-- test/CodeGen/X86/GlobalISel/legalize-ext.mir | 24 ++++++------- .../CodeGen/X86/GlobalISel/legalize-undef.mir | 36 +++++++++---------- 4 files changed, 65 insertions(+), 39 deletions(-) diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index 873587651ef..256f1ccbee7 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -109,7 +109,7 @@ public: return tryFoldImplicitDef(MI, DeadInsts); } - /// Try to fold sb = EXTEND (G_IMPLICIT_DEF sa) -> sb = G_IMPLICIT_DEF + /// Try to fold G_[ASZ]EXT (G_IMPLICIT_DEF). bool tryFoldImplicitDef(MachineInstr &MI, SmallVectorImpl &DeadInsts) { unsigned Opcode = MI.getOpcode(); @@ -119,13 +119,25 @@ public: if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(1).getReg(), MRI)) { + Builder.setInstr(MI); unsigned DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); - if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) - return false; - LLVM_DEBUG(dbgs() << ".. Combine EXT(IMPLICIT_DEF) " << MI;); - Builder.setInstr(MI); - Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg); + + if (Opcode == TargetOpcode::G_ANYEXT) { + // G_ANYEXT (G_IMPLICIT_DEF) -> G_IMPLICIT_DEF + if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) + return false; + LLVM_DEBUG(dbgs() << ".. 
Combine G_ANYEXT(G_IMPLICIT_DEF): " << MI;); + Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg); + } else { + // G_[SZ]EXT (G_IMPLICIT_DEF) -> G_CONSTANT 0 because the top + // bits will be 0 for G_ZEXT and 0/1 for the G_SEXT. + if (isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}})) + return false; + LLVM_DEBUG(dbgs() << ".. Combine G_[SZ]EXT(G_IMPLICIT_DEF): " << MI;); + Builder.buildConstant(DstReg, 0); + } + markInstAndDefDead(MI, *DefMI, DeadInsts); return true; } diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir index cf4f687408f..c4bcbb683c1 100644 --- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir @@ -82,9 +82,9 @@ body: | ; CHECK: $w0 = COPY [[ASHR2]](s32) ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: [[TRUNC10:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[TRUNC3]]4(s32) - ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]]1, [[TRUNC3]]2 - ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[TRUNC3]]3(s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C6]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[COPY5]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) ; CHECK: $w0 = COPY [[COPY6]](s32) ; CHECK: [[TRUNC11:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK: $w0 = COPY [[TRUNC11]](s32) @@ -92,6 +92,12 @@ body: | ; CHECK: $w0 = COPY [[TRUNC12]](s32) ; CHECK: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[TRUNC12]](s32) ; CHECK: $x0 = COPY [[FPEXT]](s64) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C7]](s32) + ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C8]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: $w0 = COPY [[DEF]](s32) %0(s64) = COPY $x0 %1(s1) = G_TRUNC %0 @@ -140,4 +146,12 @@ body: | $w0 = COPY %17 %18(s64) = G_FPEXT %17 $x0 = COPY %18 + + %24:_(s16) = G_IMPLICIT_DEF + %25:_(s32) 
= G_ZEXT %24(s16) + $w0 = COPY %25(s32) + %26:_(s32) = G_SEXT %24(s16) + $w0 = COPY %26(s32) + %27:_(s32) = G_ANYEXT %24(s16) + $w0 = COPY %27(s32) ... diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir index cf9b8039096..71f1facfb81 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir @@ -288,12 +288,12 @@ body: | liveins: $edi ; X32-LABEL: name: test_sext_i1toi8 - ; X32: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; X32: $al = COPY [[DEF]](s8) + ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 + ; X32: $al = COPY [[C]](s8) ; X32: RET 0, implicit $al ; X64-LABEL: name: test_sext_i1toi8 - ; X64: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; X64: $al = COPY [[DEF]](s8) + ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 + ; X64: $al = COPY [[C]](s8) ; X64: RET 0, implicit $al %0(s1) = G_IMPLICIT_DEF %1(s8) = G_SEXT %0(s1) @@ -314,12 +314,12 @@ body: | liveins: $edi ; X32-LABEL: name: test_sext_i1toi16 - ; X32: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; X32: $ax = COPY [[DEF]](s16) + ; X32: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; X32: $ax = COPY [[C]](s16) ; X32: RET 0, implicit $ax ; X64-LABEL: name: test_sext_i1toi16 - ; X64: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; X64: $ax = COPY [[DEF]](s16) + ; X64: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; X64: $ax = COPY [[C]](s16) ; X64: RET 0, implicit $ax %0(s1) = G_IMPLICIT_DEF %1(s16) = G_SEXT %0(s1) @@ -341,12 +341,12 @@ body: | liveins: $edi ; X32-LABEL: name: test_sext_i1 - ; X32: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; X32: $eax = COPY [[DEF]](s32) + ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; X32: $eax = COPY [[C]](s32) ; X32: RET 0, implicit $eax ; X64-LABEL: name: test_sext_i1 - ; X64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; X64: $eax = COPY [[DEF]](s32) + ; X64: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; X64: $eax = COPY [[C]](s32) ; X64: RET 0, implicit $eax %0(s1) = G_IMPLICIT_DEF %2(s32) = 
G_SEXT %0(s1) diff --git a/test/CodeGen/X86/GlobalISel/legalize-undef.mir b/test/CodeGen/X86/GlobalISel/legalize-undef.mir index 997064b366d..4a865e4e582 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-undef.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-undef.mir @@ -11,32 +11,32 @@ body: | liveins: ; X64-LABEL: name: test_implicit_def ; X64: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 + ; X64: G_STORE [[C]](s8), [[DEF]](p0) :: (store 1) ; X64: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF ; X64: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1) - ; X64: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; X64: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 1) - ; X64: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; X64: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 2) - ; X64: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; X64: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4) - ; X64: [[DEF5:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF - ; X64: G_STORE [[DEF5]](s64), [[DEF]](p0) :: (store 8) + ; X64: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; X64: G_STORE [[DEF2]](s16), [[DEF]](p0) :: (store 2) + ; X64: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; X64: G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store 4) + ; X64: [[DEF4:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; X64: G_STORE [[DEF4]](s64), [[DEF]](p0) :: (store 8) ; X32-LABEL: name: test_implicit_def ; X32: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 + ; X32: G_STORE [[C]](s8), [[DEF]](p0) :: (store 1) ; X32: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF ; X32: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1) - ; X32: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF - ; X32: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 1) - ; X32: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF - ; X32: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 2) + ; X32: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; X32: G_STORE [[DEF2]](s16), [[DEF]](p0) :: (store 2) + ; X32: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; X32: 
G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store 4) ; X32: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4) ; X32: [[DEF5:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; X32: [[DEF6:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; X32: G_STORE [[DEF5]](s32), [[DEF]](p0) :: (store 4, align 8) - ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s32) - ; X32: G_STORE [[DEF6]](s32), [[GEP]](p0) :: (store 4) + ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4, align 8) + ; X32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C1]](s32) + ; X32: G_STORE [[DEF5]](s32), [[GEP]](p0) :: (store 4) %5:_(p0) = G_IMPLICIT_DEF %0:_(s1) = G_IMPLICIT_DEF G_STORE %0, %5 ::(store 1) -- GitLab From b501cdb9f5588ae98681dd5d8cc6ccc22ad40cb4 Mon Sep 17 00:00:00 2001 From: Francis Visoiu Mistrih Date: Wed, 10 Oct 2018 18:07:44 +0000 Subject: [PATCH 0013/1116] Revert "[OptRemarks] Add library for parsing optimization remarks" This reverts commit 1cc98e6672b6319fdb00b70dd4474aabdadbe193. 
Seems to break bots: http://lab.llvm.org:8011/builders/clang-x86_64-linux-abi-test/builds/33398/steps/build-unified-tree/logs/stdio git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344164 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/OptRemarks.h | 197 -------- lib/CMakeLists.txt | 1 - lib/LLVMBuild.txt | 1 - lib/OptRemarks/CMakeLists.txt | 3 - lib/OptRemarks/LLVMBuild.txt | 22 - lib/OptRemarks/OptRemarksParser.cpp | 368 --------------- tools/llvm-opt-report/CMakeLists.txt | 2 +- tools/llvm-opt-report/OptReport.cpp | 144 ++++-- unittests/CMakeLists.txt | 1 - unittests/OptRemarks/CMakeLists.txt | 8 - .../OptRemarks/OptRemarksParsingTest.cpp | 433 ------------------ 11 files changed, 107 insertions(+), 1073 deletions(-) delete mode 100644 include/llvm-c/OptRemarks.h delete mode 100644 lib/OptRemarks/CMakeLists.txt delete mode 100644 lib/OptRemarks/LLVMBuild.txt delete mode 100644 lib/OptRemarks/OptRemarksParser.cpp delete mode 100644 unittests/OptRemarks/CMakeLists.txt delete mode 100644 unittests/OptRemarks/OptRemarksParsingTest.cpp diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h deleted file mode 100644 index f3449cc1b8c..00000000000 --- a/include/llvm-c/OptRemarks.h +++ /dev/null @@ -1,197 +0,0 @@ -/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\ -|* *| -|* The LLVM Compiler Infrastructure *| -|* *| -|* This file is distributed under the University of Illinois Open Source *| -|* License. See LICENSE.TXT for details. *| -|* *| -|*===----------------------------------------------------------------------===*| -|* *| -|* This header provides a public interface to an opt-remark library. *| -|* LLVM provides an implementation of this interface. 
*| -|* *| -\*===----------------------------------------------------------------------===*/ - -#ifndef LLVM_C_OPT_REMARKS_H -#define LLVM_C_OPT_REMARKS_H - -#include "llvm-c/Core.h" -#include "llvm-c/Types.h" -#ifdef __cplusplus -#include -extern "C" { -#else -#include -#endif /* !defined(__cplusplus) */ - -/** - * @defgroup LLVMCOPTREMARKS OptRemarks - * @ingroup LLVMC - * - * @{ - */ - -#define OPT_REMARKS_API_VERSION 0 - -/** - * String containing a buffer and a length. The buffer is not guaranteed to be - * zero-terminated. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -typedef struct { - const char *Str; - uint32_t Len; -} LLVMOptRemarkStringRef; - -/** - * DebugLoc containing File, Line and Column. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -typedef struct { - // File: - LLVMOptRemarkStringRef SourceFile; - // Line: - uint32_t SourceLineNumber; - // Column: - uint32_t SourceColumnNumber; -} LLVMOptRemarkDebugLoc; - -/** - * Element of the "Args" list. The key might give more information about what - * are the semantics of the value, e.g. "Callee" will tell you that the value - * is a symbol that names a function. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -typedef struct { - // e.g. "Callee" - LLVMOptRemarkStringRef Key; - // e.g. "malloc" - LLVMOptRemarkStringRef Value; - - // "DebugLoc": Optional - LLVMOptRemarkDebugLoc DebugLoc; -} LLVMOptRemarkArg; - -/** - * One remark entry. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -typedef struct { - // e.g. !Missed, !Passed - LLVMOptRemarkStringRef RemarkType; - // "Pass": Required - LLVMOptRemarkStringRef PassName; - // "Name": Required - LLVMOptRemarkStringRef RemarkName; - // "Function": Required - LLVMOptRemarkStringRef FunctionName; - - // "DebugLoc": Optional - LLVMOptRemarkDebugLoc DebugLoc; - // "Hotness": Optional - uint32_t Hotness; - // "Args": Optional. It is an array of `num_args` elements. 
- uint32_t NumArgs; - LLVMOptRemarkArg *Args; -} LLVMOptRemarkEntry; - -typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef; - -/** - * Creates a remark parser that can be used to read and parse the buffer located - * in \p Buf of size \p Size. - * - * \p Buf cannot be NULL. - * - * This function should be paired with LLVMOptRemarkParserDispose() to avoid - * leaking resources. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf, - uint64_t Size); - -/** - * Returns the next remark in the file. - * - * The value pointed to by the return value is invalidated by the next call to - * LLVMOptRemarkParserGetNext(). - * - * If the parser reaches the end of the buffer, the return value will be NULL. - * - * In the case of an error, the return value will be NULL, and: - * - * 1) LLVMOptRemarkParserHasError() will return `1`. - * - * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error - * message. - * - * An error may occur if: - * - * 1) An argument is invalid. - * - * 2) There is a YAML parsing error. This type of error aborts parsing - * immediately and returns `1`. It can occur on malformed YAML. - * - * 3) Remark parsing error. If this type of error occurs, the parser won't call - * the handler and will continue to the next one. It can occur on malformed - * remarks, like missing or extra fields in the file. 
- * - * Here is a quick example of the usage: - * - * ``` - * LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size); - * LLVMOptRemarkEntry *Remark = NULL; - * while ((Remark == LLVMOptRemarkParserGetNext(Parser))) { - * // use Remark - * } - * bool HasError = LLVMOptRemarkParserHasError(Parser); - * LLVMOptRemarkParserDispose(Parser); - * ``` - * - * \since OPT_REMARKS_API_VERSION=0 - */ -extern LLVMOptRemarkEntry * -LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser); - -/** - * Returns `1` if the parser encountered an error while parsing the buffer. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser); - -/** - * Returns a null-terminated string containing an error message. - * - * In case of no error, the result is `NULL`. - * - * The memory of the string is bound to the lifetime of \p Parser. If - * LLVMOptRemarkParserDispose() is called, the memory of the string will be - * released. - * - * \since OPT_REMARKS_API_VERSION=0 - */ -extern const char * -LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser); - -/** - * Releases all the resources used by \p Parser. 
- * - * \since OPT_REMARKS_API_VERSION=0 - */ -extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser); - -/** - * @} // endgoup LLVMCOPTREMARKS - */ - -#ifdef __cplusplus -} -#endif /* !defined(__cplusplus) */ - -#endif /* LLVM_C_OPT_REMARKS_H */ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 1f54c611bad..ecf8b93d253 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -15,7 +15,6 @@ add_subdirectory(MC) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) -add_subdirectory(OptRemarks) add_subdirectory(DebugInfo) add_subdirectory(ExecutionEngine) add_subdirectory(Target) diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt index 0eb4bba2676..a6cd15699fb 100644 --- a/lib/LLVMBuild.txt +++ b/lib/LLVMBuild.txt @@ -35,7 +35,6 @@ subdirectories = BinaryFormat ObjectYAML Option - OptRemarks Passes ProfileData Support diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt deleted file mode 100644 index 8fefe1d986b..00000000000 --- a/lib/OptRemarks/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMOptRemarks - OptRemarksParser.cpp -) diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt deleted file mode 100644 index 4c1032296dc..00000000000 --- a/lib/OptRemarks/LLVMBuild.txt +++ /dev/null @@ -1,22 +0,0 @@ -;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = OptRemarks -parent = Libraries -required_libraries = Support diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp deleted file mode 100644 index 4b8b038c832..00000000000 --- a/lib/OptRemarks/OptRemarksParser.cpp +++ /dev/null @@ -1,368 +0,0 @@ -//===- OptRemarksParser.cpp -----------------------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source Licenses. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides utility methods used by clients that want to use the -// parser for optimization remarks in LLVM. -// -//===----------------------------------------------------------------------===// - -#include "llvm-c/OptRemarks.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/YAMLTraits.h" - -using namespace llvm; - -namespace { -struct RemarkParser { - /// Source manager for better error messages. - SourceMgr SM; - /// Stream for yaml parsing. - yaml::Stream Stream; - /// Storage for the error stream. - std::string ErrorString; - /// The error stream. - raw_string_ostream ErrorStream; - /// Iterator in the YAML stream. - yaml::document_iterator DI; - /// The parsed remark (if any). - Optional LastRemark; - /// Temporary parsing buffer for the arguments. - SmallVector TmpArgs; - /// The state used by the parser to parse a remark entry. Invalidated with - /// every call to `parseYAMLElement`. - struct ParseState { - /// Temporary parsing buffer for the arguments. 
- SmallVectorImpl *Args; - StringRef Type; - StringRef Pass; - StringRef Name; - StringRef Function; - /// Optional. - Optional File; - Optional Line; - Optional Column; - Optional Hotness; - - ParseState(SmallVectorImpl &Args) : Args(&Args) {} - /// Use Args only as a **temporary** buffer. - ~ParseState() { Args->clear(); } - }; - - ParseState State; - - /// Set to `true` if we had any errors during parsing. - bool HadAnyErrors = false; - - RemarkParser(StringRef Buf) - : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString), - DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) { - SM.setDiagHandler(RemarkParser::HandleDiagnostic, this); - } - - /// Parse a YAML element. - Error parseYAMLElement(yaml::Document &Remark); - -private: - /// Parse one key to a string. - /// otherwise. - Error parseKey(StringRef &Result, yaml::KeyValueNode &Node); - /// Parse one value to a string. - Error parseValue(StringRef &Result, yaml::KeyValueNode &Node); - /// Parse one value to an unsigned. - Error parseValue(Optional &Result, yaml::KeyValueNode &Node); - /// Parse a debug location. - Error parseDebugLoc(Optional &File, Optional &Line, - Optional &Column, yaml::KeyValueNode &Node); - /// Parse an argument. - Error parseArg(SmallVectorImpl &TmpArgs, yaml::Node &Node); - - /// Handle a diagnostic from the YAML stream. Records the error in the - /// RemarkParser class. 
- static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) { - assert(Ctx && "Expected non-null Ctx in diagnostic handler."); - auto *Parser = static_cast(Ctx); - Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false, - /*ShowKindLabels*/ true); - } -}; - -class ParseError : public ErrorInfo { -public: - static char ID; - - ParseError(StringRef Message, yaml::Node &Node) - : Message(Message), Node(Node) {} - - void log(raw_ostream &OS) const override { OS << Message; } - std::error_code convertToErrorCode() const override { - return inconvertibleErrorCode(); - } - - StringRef getMessage() const { return Message; } - yaml::Node &getNode() const { return Node; } - -private: - StringRef Message; // No need to hold a full copy of the buffer. - yaml::Node &Node; -}; - -char ParseError::ID = 0; - -static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) { - return {Str.data(), static_cast(Str.size())}; -} - -Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) { - auto *Key = dyn_cast(Node.getKey()); - if (!Key) - return make_error("key is not a string.", Node); - - Result = Key->getRawValue(); - return Error::success(); -} - -Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) { - auto *Value = dyn_cast(Node.getValue()); - if (!Value) - return make_error("expected a value of scalar type.", Node); - Result = Value->getRawValue(); - - if (Result.front() == '\'') - Result = Result.drop_front(); - - if (Result.back() == '\'') - Result = Result.drop_back(); - - return Error::success(); -} - -Error RemarkParser::parseValue(Optional &Result, - yaml::KeyValueNode &Node) { - SmallVector Tmp; - auto *Value = dyn_cast(Node.getValue()); - if (!Value) - return make_error("expected a value of scalar type.", Node); - unsigned UnsignedValue = 0; - if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue)) - return make_error("expected a value of integer type.", *Value); - Result = UnsignedValue; - return 
Error::success(); -} - -Error RemarkParser::parseDebugLoc(Optional &File, - Optional &Line, - Optional &Column, - yaml::KeyValueNode &Node) { - auto *DebugLoc = dyn_cast(Node.getValue()); - if (!DebugLoc) - return make_error("expected a value of mapping type.", Node); - - for (yaml::KeyValueNode &DLNode : *DebugLoc) { - StringRef KeyName; - if (Error E = parseKey(KeyName, DLNode)) - return E; - if (KeyName == "File") { - File = StringRef(); // Set the optional to contain a default constructed - // value, to be passed to the parsing function. - if (Error E = parseValue(*File, DLNode)) - return E; - } else if (KeyName == "Column") { - if (Error E = parseValue(Column, DLNode)) - return E; - } else if (KeyName == "Line") { - if (Error E = parseValue(Line, DLNode)) - return E; - } else { - return make_error("unknown entry in DebugLoc map.", DLNode); - } - } - - // If any of the debug loc fields is missing, return an error. - if (!File || !Line || !Column) - return make_error("DebugLoc node incomplete.", Node); - - return Error::success(); -} - -Error RemarkParser::parseArg(SmallVectorImpl &Args, - yaml::Node &Node) { - auto *ArgMap = dyn_cast(&Node); - if (!ArgMap) - return make_error("expected a value of mapping type.", Node); - - StringRef ValueStr; - StringRef KeyStr; - Optional File; - Optional Line; - Optional Column; - - for (yaml::KeyValueNode &ArgEntry : *ArgMap) { - StringRef KeyName; - if (Error E = parseKey(KeyName, ArgEntry)) - return E; - - // Try to parse debug locs. - if (KeyName == "DebugLoc") { - // Can't have multiple DebugLoc entries per argument. - if (File || Line || Column) - return make_error( - "only one DebugLoc entry is allowed per argument.", ArgEntry); - - if (Error E = parseDebugLoc(File, Line, Column, ArgEntry)) - return E; - continue; - } - - // If we already have a string, error out. - if (!ValueStr.empty()) - return make_error( - "only one string entry is allowed per argument.", ArgEntry); - - // Try to parse a string. 
- if (Error E = parseValue(ValueStr, ArgEntry)) - return E; - - // Keep the key from the string. - KeyStr = KeyName; - } - - if (KeyStr.empty()) - return make_error("argument key is missing.", *ArgMap); - if (ValueStr.empty()) - return make_error("argument value is missing.", *ArgMap); - - Args.push_back(LLVMOptRemarkArg{ - toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr), - LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())), - Line.getValueOr(0), Column.getValueOr(0)}}); - - return Error::success(); -} - -Error RemarkParser::parseYAMLElement(yaml::Document &Remark) { - // Parsing a new remark, clear the previous one. - LastRemark = None; - State = ParseState(TmpArgs); - - auto *Root = dyn_cast(Remark.getRoot()); - if (!Root) - return make_error("document root is not of mapping type.", - *Remark.getRoot()); - - State.Type = Root->getRawTag(); - - for (yaml::KeyValueNode &RemarkField : *Root) { - StringRef KeyName; - if (Error E = parseKey(KeyName, RemarkField)) - return E; - - if (KeyName == "Pass") { - if (Error E = parseValue(State.Pass, RemarkField)) - return E; - } else if (KeyName == "Name") { - if (Error E = parseValue(State.Name, RemarkField)) - return E; - } else if (KeyName == "Function") { - if (Error E = parseValue(State.Function, RemarkField)) - return E; - } else if (KeyName == "Hotness") { - if (Error E = parseValue(State.Hotness, RemarkField)) - return E; - } else if (KeyName == "DebugLoc") { - if (Error E = - parseDebugLoc(State.File, State.Line, State.Column, RemarkField)) - return E; - } else if (KeyName == "Args") { - auto *Args = dyn_cast(RemarkField.getValue()); - if (!Args) - return make_error("wrong value type for key.", RemarkField); - - for (yaml::Node &Arg : *Args) - if (Error E = parseArg(*State.Args, Arg)) - return E; - } else { - return make_error("unknown key.", RemarkField); - } - } - - // If the YAML parsing failed, don't even continue parsing. We might - // encounter malformed YAML. 
- if (Stream.failed()) - return make_error("YAML parsing failed.", *Remark.getRoot()); - - // Check if any of the mandatory fields are missing. - if (State.Type.empty() || State.Pass.empty() || State.Name.empty() || - State.Function.empty()) - return make_error("Type, Pass, Name or Function missing.", - *Remark.getRoot()); - - LastRemark = LLVMOptRemarkEntry{ - toOptRemarkStr(State.Type), - toOptRemarkStr(State.Pass), - toOptRemarkStr(State.Name), - toOptRemarkStr(State.Function), - LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())), - State.Line.getValueOr(0), - State.Column.getValueOr(0)}, - State.Hotness.getValueOr(0), - static_cast(State.Args->size()), - State.Args->data()}; - - return Error::success(); -} -} // namespace - -// Create wrappers for C Binding types (see CBindingWrapping.h). -DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef); - -extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf, - uint64_t Size) { - return wrap( - new RemarkParser(StringRef(static_cast(Buf), Size))); -} - -extern "C" LLVMOptRemarkEntry * -LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) { - RemarkParser &TheParser = *unwrap(Parser); - // Check for EOF. - if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end()) - return nullptr; - - // Try to parse an entry. - if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) { - handleAllErrors(std::move(E), [&](const ParseError &PE) { - TheParser.Stream.printError(&PE.getNode(), - Twine(PE.getMessage()) + Twine('\n')); - TheParser.HadAnyErrors = true; - }); - return nullptr; - } - - // Move on. - ++TheParser.DI; - - // Return the just-parsed remark. 
- if (Optional &Entry = TheParser.LastRemark) - return &*Entry; - return nullptr; -} - -extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) { - return unwrap(Parser)->HadAnyErrors; -} - -extern "C" const char * -LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) { - return unwrap(Parser)->ErrorStream.str().c_str(); -} - -extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) { - delete unwrap(Parser); -} diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt index 3aabc03ab3f..777537a54c0 100644 --- a/tools/llvm-opt-report/CMakeLists.txt +++ b/tools/llvm-opt-report/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support) +set(LLVM_LINK_COMPONENTS Core Demangle Object Support) add_llvm_tool(llvm-opt-report OptReport.cpp diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp index 071f779a9e6..aa7966132c2 100644 --- a/tools/llvm-opt-report/OptReport.cpp +++ b/tools/llvm-opt-report/OptReport.cpp @@ -28,7 +28,6 @@ #include "llvm/Support/WithColor.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" -#include "llvm-c/OptRemarks.h" #include #include #include @@ -143,44 +142,104 @@ typedef std::map>>> LocationInfoTy; } // anonymous namespace -static bool readLocationInfo(LocationInfoTy &LocationInfo) { - ErrorOr> Buf = - MemoryBuffer::getFile(InputFileName.c_str()); - if (std::error_code EC = Buf.getError()) { - WithColor::error() << "Can't open file " << InputFileName << ": " - << EC.message() << "\n"; - return false; - } +static void collectLocationInfo(yaml::Stream &Stream, + LocationInfoTy &LocationInfo) { + SmallVector Tmp; + + // Note: We're using the YAML parser here directly, instead of using the + // YAMLTraits implementation, because the YAMLTraits implementation does not + // support a way to handle only a subset of the input keys (it will error out + // if there is an 
input key that you don't map to your class), and + // furthermore, it does not provide a way to handle the Args sequence of + // key/value pairs, where the order must be captured and the 'String' key + // might be repeated. + for (auto &Doc : Stream) { + auto *Root = dyn_cast(Doc.getRoot()); + if (!Root) + continue; - StringRef Buffer = (*Buf)->getBuffer(); - LLVMOptRemarkParserRef Parser = - LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size()); - - LLVMOptRemarkEntry *Remark = nullptr; - while ((Remark = LLVMOptRemarkParserGetNext(Parser))) { - bool Transformed = - StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed"; - StringRef Pass(Remark->PassName.Str, Remark->PassName.Len); - StringRef File(Remark->DebugLoc.SourceFile.Str, - Remark->DebugLoc.SourceFile.Len); - StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len); - uint32_t Line = Remark->DebugLoc.SourceLineNumber; - uint32_t Column = Remark->DebugLoc.SourceColumnNumber; - ArrayRef Args(Remark->Args, Remark->NumArgs); + bool Transformed = Root->getRawTag() == "!Passed"; + std::string Pass, File, Function; + int Line = 0, Column = 1; int VectorizationFactor = 1; int InterleaveCount = 1; int UnrollCount = 1; - for (const LLVMOptRemarkArg &Arg : Args) { - StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len); - StringRef ArgValue(Arg.Value.Str, Arg.Value.Len); - if (ArgKeyName == "VectorizationFactor") - ArgValue.getAsInteger(10, VectorizationFactor); - else if (ArgKeyName == "InterleaveCount") - ArgValue.getAsInteger(10, InterleaveCount); - else if (ArgKeyName == "UnrollCount") - ArgValue.getAsInteger(10, UnrollCount); + for (auto &RootChild : *Root) { + auto *Key = dyn_cast(RootChild.getKey()); + if (!Key) + continue; + StringRef KeyName = Key->getValue(Tmp); + if (KeyName == "Pass") { + auto *Value = dyn_cast(RootChild.getValue()); + if (!Value) + continue; + Pass = Value->getValue(Tmp); + } else if (KeyName == "Function") { + auto *Value = dyn_cast(RootChild.getValue()); + if 
(!Value) + continue; + Function = Value->getValue(Tmp); + } else if (KeyName == "DebugLoc") { + auto *DebugLoc = dyn_cast(RootChild.getValue()); + if (!DebugLoc) + continue; + + for (auto &DLChild : *DebugLoc) { + auto *DLKey = dyn_cast(DLChild.getKey()); + if (!DLKey) + continue; + StringRef DLKeyName = DLKey->getValue(Tmp); + if (DLKeyName == "File") { + auto *Value = dyn_cast(DLChild.getValue()); + if (!Value) + continue; + File = Value->getValue(Tmp); + } else if (DLKeyName == "Line") { + auto *Value = dyn_cast(DLChild.getValue()); + if (!Value) + continue; + Value->getValue(Tmp).getAsInteger(10, Line); + } else if (DLKeyName == "Column") { + auto *Value = dyn_cast(DLChild.getValue()); + if (!Value) + continue; + Value->getValue(Tmp).getAsInteger(10, Column); + } + } + } else if (KeyName == "Args") { + auto *Args = dyn_cast(RootChild.getValue()); + if (!Args) + continue; + for (auto &ArgChild : *Args) { + auto *ArgMap = dyn_cast(&ArgChild); + if (!ArgMap) + continue; + for (auto &ArgKV : *ArgMap) { + auto *ArgKey = dyn_cast(ArgKV.getKey()); + if (!ArgKey) + continue; + StringRef ArgKeyName = ArgKey->getValue(Tmp); + if (ArgKeyName == "VectorizationFactor") { + auto *Value = dyn_cast(ArgKV.getValue()); + if (!Value) + continue; + Value->getValue(Tmp).getAsInteger(10, VectorizationFactor); + } else if (ArgKeyName == "InterleaveCount") { + auto *Value = dyn_cast(ArgKV.getValue()); + if (!Value) + continue; + Value->getValue(Tmp).getAsInteger(10, InterleaveCount); + } else if (ArgKeyName == "UnrollCount") { + auto *Value = dyn_cast(ArgKV.getValue()); + if (!Value) + continue; + Value->getValue(Tmp).getAsInteger(10, UnrollCount); + } + } + } + } } if (Line < 1 || File.empty()) @@ -209,13 +268,22 @@ static bool readLocationInfo(LocationInfoTy &LocationInfo) { UpdateLLII(LI.Vectorized); } } +} + +static bool readLocationInfo(LocationInfoTy &LocationInfo) { + ErrorOr> Buf = + MemoryBuffer::getFileOrSTDIN(InputFileName); + if (std::error_code EC = Buf.getError()) { + 
WithColor::error() << "Can't open file " << InputFileName << ": " + << EC.message() << "\n"; + return false; + } - bool HasError = LLVMOptRemarkParserHasError(Parser); - if (HasError) - WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n"; + SourceMgr SM; + yaml::Stream Stream(Buf.get()->getBuffer(), SM); + collectLocationInfo(Stream, LocationInfo); - LLVMOptRemarkParserDispose(Parser); - return !HasError; + return true; } static bool writeReport(LocationInfoTy &LocationInfo) { diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index 5dba2de4a88..bc41ab66a23 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -26,7 +26,6 @@ add_subdirectory(MI) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) -add_subdirectory(OptRemarks) add_subdirectory(Passes) add_subdirectory(ProfileData) add_subdirectory(Support) diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt deleted file mode 100644 index 94c74867cc4..00000000000 --- a/unittests/OptRemarks/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(LLVM_LINK_COMPONENTS - OptRemarks - Support - ) - -add_llvm_unittest(OptRemarksTests - OptRemarksParsingTest.cpp - ) diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp deleted file mode 100644 index a28820ffb7f..00000000000 --- a/unittests/OptRemarks/OptRemarksParsingTest.cpp +++ /dev/null @@ -1,433 +0,0 @@ -//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm-c/OptRemarks.h" -#include "gtest/gtest.h" - -using namespace llvm; - -template bool tryParse(const char (&Buf)[N]) { - LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1); - LLVMOptRemarkEntry *Remark = nullptr; - while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) { - EXPECT_TRUE(Remark == nullptr); // Only one remark per test. - Remark = NewRemark; - } - EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test. - bool HasError = LLVMOptRemarkParserHasError(Parser); - LLVMOptRemarkParserDispose(Parser); - return !HasError; -} - -template -bool parseExpectError(const char (&Buf)[N], const char *Error) { - LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1); - LLVMOptRemarkEntry *Remark = nullptr; - while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) { - EXPECT_FALSE(NewRemark); - } - EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark. - EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser)); - bool MatchesError = - StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error); - LLVMOptRemarkParserDispose(Parser); - - return MatchesError; -} - -TEST(OptRemarks, OptRemarksParsingEmpty) { - StringRef Buf = R"YAML( -)YAML"; - LLVMOptRemarkParserRef Parser = - LLVMOptRemarkParserCreate(Buf.data(), Buf.size()); - LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser); - EXPECT_TRUE(NewRemark == nullptr); // No remark expected. 
- EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser)); - EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)) - .contains("document root is not of mapping type.")); - LLVMOptRemarkParserDispose(Parser); -} - -TEST(OptRemarks, OptRemarksParsingGood) { - EXPECT_TRUE(tryParse(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -DebugLoc: { File: file.c, Line: 3, Column: 12 } -Function: foo -Args: - - Callee: bar - - String: ' will not be inlined into ' - - Caller: foo - DebugLoc: { File: file.c, Line: 2, Column: 0 } - - String: ' because its definition is unavailable' -)YAML")); - - // No debug loc should also pass. - EXPECT_TRUE(tryParse(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -Args: - - Callee: bar - - String: ' will not be inlined into ' - - Caller: foo - DebugLoc: { File: file.c, Line: 2, Column: 0 } - - String: ' because its definition is unavailable' -)YAML")); - - // No args is also ok. - EXPECT_TRUE(tryParse(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -DebugLoc: { File: file.c, Line: 3, Column: 12 } -Function: foo -)YAML")); - - // Different order. - EXPECT_TRUE(tryParse(R"YAML( ---- !Missed -DebugLoc: { Line: 3, Column: 12, File: file.c } -Function: foo -Name: NoDefinition -Args: - - Callee: bar - - String: ' will not be inlined into ' - - Caller: foo - DebugLoc: { File: file.c, Line: 2, Column: 0 } - - String: ' because its definition is unavailable' -Pass: inline -)YAML")); -} - -// Mandatory common part of a remark. -#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n" -// Test all the types. 
-TEST(OptRemarks, OptRemarksParsingTypes) { - // Type: Passed - EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK)); - // Type: Missed - EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK)); - // Type: Analysis - EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK)); - // Type: AnalysisFPCompute - EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK)); - // Type: AnalysisAliasing - EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK)); - // Type: Failure - EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK)); -} -#undef COMMON_REMARK - -TEST(OptRemarks, OptRemarksParsingMissingFields) { - // No type. - EXPECT_TRUE(parseExpectError(R"YAML( ---- -Pass: inline -Name: NoDefinition -Function: foo -)YAML", - "error: Type, Pass, Name or Function missing.")); - // No pass. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Name: NoDefinition -Function: foo -)YAML", - "error: Type, Pass, Name or Function missing.")); - // No name. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Function: foo -)YAML", - "error: Type, Pass, Name or Function missing.")); - // No function. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -)YAML", - "error: Type, Pass, Name or Function missing.")); - // Debug loc but no file. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { Line: 3, Column: 12 } -)YAML", - "DebugLoc node incomplete.")); - // Debug loc but no line. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Column: 12 } -)YAML", - "DebugLoc node incomplete.")); - // Debug loc but no column. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Line: 3 } -)YAML", - "DebugLoc node incomplete.")); -} - -TEST(OptRemarks, OptRemarksParsingWrongTypes) { - // Wrong debug loc type. 
- EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: foo -)YAML", - "expected a value of mapping type.")); - // Wrong line type. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Line: b, Column: 12 } -)YAML", - "expected a value of integer type.")); - // Wrong column type. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Line: 3, Column: c } -)YAML", - "expected a value of integer type.")); - // Wrong args type. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -Args: foo -)YAML", - "wrong value type for key.")); - // Wrong key type. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -{ A: a }: inline -Name: NoDefinition -Function: foo -)YAML", - "key is not a string.")); - // Debug loc with unknown entry. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Column: 12, Unknown: 12 } -)YAML", - "unknown entry in DebugLoc map.")); - // Unknown entry. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Unknown: inline -)YAML", - "unknown key.")); - // Not a scalar. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: { File: a, Line: 1, Column: 2 } -Name: NoDefinition -Function: foo -)YAML", - "expected a value of scalar type.")); - // Not a string file in debug loc. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: { a: b }, Column: 12, Line: 12 } -)YAML", - "expected a value of scalar type.")); - // Not a integer column in debug loc. 
- EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Column: { a: b }, Line: 12 } -)YAML", - "expected a value of scalar type.")); - // Not a integer line in debug loc. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Column: 12, Line: { a: b } } -)YAML", - "expected a value of scalar type.")); - // Not a mapping type value for args. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -DebugLoc: { File: file.c, Column: 12, Line: { a: b } } -)YAML", - "expected a value of scalar type.")); -} - -TEST(OptRemarks, OptRemarksParsingWrongArgs) { - // Multiple debug locs per arg. - EXPECT_TRUE( - parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -Args: - - Str: string - DebugLoc: { File: a, Line: 1, Column: 2 } - DebugLoc: { File: a, Line: 1, Column: 2 } -)YAML", - "only one DebugLoc entry is allowed per argument.")); - // Multiple strings per arg. - EXPECT_TRUE( - parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -Args: - - Str: string - Str2: string - DebugLoc: { File: a, Line: 1, Column: 2 } -)YAML", - "only one string entry is allowed per argument.")); - // No arg value. - EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -Args: - - Callee: '' - - DebugLoc: { File: a, Line: 1, Column: 2 } -)YAML", - "argument value is missing.")); - // No arg value. 
- EXPECT_TRUE(parseExpectError(R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -Function: foo -Args: - - DebugLoc: { File: a, Line: 1, Column: 2 } -)YAML", - "argument key is missing.")); - -} - -TEST(OptRemarks, OptRemarksGoodStruct) { - StringRef Buf = R"YAML( ---- !Missed -Pass: inline -Name: NoDefinition -DebugLoc: { File: file.c, Line: 3, Column: 12 } -Function: foo -Args: - - Callee: bar - - String: ' will not be inlined into ' - - Caller: foo - DebugLoc: { File: file.c, Line: 2, Column: 0 } - - String: ' because its definition is unavailable' -)YAML"; - - LLVMOptRemarkParserRef Parser = - LLVMOptRemarkParserCreate(Buf.data(), Buf.size()); - LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser); - EXPECT_FALSE(Remark == nullptr); - EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed"); - EXPECT_EQ(Remark->RemarkType.Len, 7U); - EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline"); - EXPECT_EQ(Remark->PassName.Len, 6U); - EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition"); - EXPECT_EQ(Remark->RemarkName.Len, 12U); - EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo"); - EXPECT_EQ(Remark->FunctionName.Len, 3U); - EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c"); - EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U); - EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U); - EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U); - EXPECT_EQ(Remark->Hotness, 0U); - EXPECT_EQ(Remark->NumArgs, 4U); - // Arg 0 - { - LLVMOptRemarkArg &Arg = Remark->Args[0]; - EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee"); - EXPECT_EQ(Arg.Key.Len, 6U); - EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar"); - EXPECT_EQ(Arg.Value.Len, 3U); - EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); - EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); - EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); - EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); - } - // Arg 1 - { - LLVMOptRemarkArg &Arg = Remark->Args[1]; - EXPECT_EQ(StringRef(Arg.Key.Str, 6), 
"String"); - EXPECT_EQ(Arg.Key.Len, 6U); - EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into "); - EXPECT_EQ(Arg.Value.Len, 26U); - EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); - EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); - EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); - EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); - } - // Arg 2 - { - LLVMOptRemarkArg &Arg = Remark->Args[2]; - EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller"); - EXPECT_EQ(Arg.Key.Len, 6U); - EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo"); - EXPECT_EQ(Arg.Value.Len, 3U); - EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c"); - EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U); - EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U); - EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); - } - // Arg 3 - { - LLVMOptRemarkArg &Arg = Remark->Args[3]; - EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String"); - EXPECT_EQ(Arg.Key.Len, 6U); - EXPECT_EQ(StringRef(Arg.Value.Str, 38), - " because its definition is unavailable"); - EXPECT_EQ(Arg.Value.Len, 38U); - EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); - EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); - EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); - EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); - } - - EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr); - - EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser)); - LLVMOptRemarkParserDispose(Parser); -} -- GitLab From 2a9ea3459b2a29c629a23d1d9b1287a20e7f7c1a Mon Sep 17 00:00:00 2001 From: Scott Linder Date: Wed, 10 Oct 2018 18:14:02 +0000 Subject: [PATCH 0014/1116] [Support] Remove redundant qualifiers in YAMLTraits (NFC) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344166 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/YAMLTraits.h | 12 ------- lib/Support/YAMLTraits.cpp | 56 +++++++++++++++---------------- 2 files changed, 28 insertions(+), 40 deletions(-) diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 
6836aa2aa06..5d029ad5ce9 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -250,7 +250,6 @@ struct has_ScalarEnumerationTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -267,7 +266,6 @@ struct has_ScalarBitSetTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -287,7 +285,6 @@ struct has_ScalarTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr, nullptr, nullptr)) == 1); }; @@ -306,7 +303,6 @@ struct has_BlockScalarTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr, nullptr)) == 1); }; @@ -321,7 +317,6 @@ template struct has_MappingTraits { template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -335,7 +330,6 @@ template struct has_MappingTraits { template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -349,7 +343,6 @@ template struct has_MappingValidateTraits { template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -363,7 +356,6 @@ template struct has_MappingValidateTraits { template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -379,7 +371,6 @@ struct has_SequenceMethodTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -395,7 +386,6 @@ struct has_CustomMappingTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr)) == 1); }; @@ -425,7 +415,6 @@ struct has_FlowTraits template static char (&f(...))[2]; -public: static bool const value = sizeof(f(nullptr)) == 2; }; @@ -446,7 +435,6 @@ struct has_DocumentListTraits template static double test(...); -public: static bool const value = (sizeof(test>(nullptr))==1); 
}; diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp index d6345efd00c..f8492c96bab 100644 --- a/lib/Support/YAMLTraits.cpp +++ b/lib/Support/YAMLTraits.cpp @@ -98,7 +98,7 @@ bool Input::setCurrentDocument() { ++DocIterator; return setCurrentDocument(); } - TopNode = this->createHNodes(N); + TopNode = createHNodes(N); CurrentNode = TopNode.get(); return true; } @@ -343,7 +343,7 @@ void Input::blockScalarString(StringRef &S) { scalarString(S, QuotingType::None) void Input::setError(HNode *hnode, const Twine &message) { assert(hnode && "HNode must not be NULL"); - this->setError(hnode->_node, message); + setError(hnode->_node, message); } void Input::setError(Node *node, const Twine &message) { @@ -366,7 +366,7 @@ std::unique_ptr Input::createHNodes(Node *N) { } else if (SequenceNode *SQ = dyn_cast(N)) { auto SQHNode = llvm::make_unique(N); for (Node &SN : *SQ) { - auto Entry = this->createHNodes(&SN); + auto Entry = createHNodes(&SN); if (EC) break; SQHNode->Entries.push_back(std::move(Entry)); @@ -391,7 +391,7 @@ std::unique_ptr Input::createHNodes(Node *N) { // Copy string to permanent storage KeyStr = StringStorage.str().copy(StringAllocator); } - auto ValueHNode = this->createHNodes(Value); + auto ValueHNode = createHNodes(Value); if (EC) break; mapHNode->Mapping[KeyStr] = std::move(ValueHNode); @@ -406,7 +406,7 @@ std::unique_ptr Input::createHNodes(Node *N) { } void Input::setError(const Twine &Message) { - this->setError(CurrentNode, Message); + setError(CurrentNode, Message); } bool Input::canElideEmptySequence() { @@ -440,11 +440,11 @@ bool Output::mapTag(StringRef Tag, bool Use) { StateStack.size() > 1 && (StateStack[StateStack.size() - 2] == inSeq || StateStack[StateStack.size() - 2] == inFlowSeq); if (SequenceElement && StateStack.back() == inMapFirstKey) { - this->newLineCheck(); + newLineCheck(); } else { - this->output(" "); + output(" "); } - this->output(Tag); + output(Tag); if (SequenceElement) { // If we're writing the tag 
during the first element of a map, the tag // takes the place of the first element in the sequence. @@ -476,8 +476,8 @@ bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault, if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) { flowKey(Key); } else { - this->newLineCheck(); - this->paddedKey(Key); + newLineCheck(); + paddedKey(Key); } return true; } @@ -496,23 +496,23 @@ void Output::postflightKey(void *) { void Output::beginFlowMapping() { StateStack.push_back(inFlowMapFirstKey); - this->newLineCheck(); + newLineCheck(); ColumnAtMapFlowStart = Column; output("{ "); } void Output::endFlowMapping() { StateStack.pop_back(); - this->outputUpToEndOfLine(" }"); + outputUpToEndOfLine(" }"); } void Output::beginDocuments() { - this->outputUpToEndOfLine("---"); + outputUpToEndOfLine("---"); } bool Output::preflightDocument(unsigned index) { if (index > 0) - this->outputUpToEndOfLine("\n---"); + outputUpToEndOfLine("\n---"); return true; } @@ -542,7 +542,7 @@ void Output::postflightElement(void *) { unsigned Output::beginFlowSequence() { StateStack.push_back(inFlowSeq); - this->newLineCheck(); + newLineCheck(); ColumnAtFlowStart = Column; output("[ "); NeedFlowSequenceComma = false; @@ -551,7 +551,7 @@ unsigned Output::beginFlowSequence() { void Output::endFlowSequence() { StateStack.pop_back(); - this->outputUpToEndOfLine(" ]"); + outputUpToEndOfLine(" ]"); } bool Output::preflightFlowElement(unsigned, void *&) { @@ -577,8 +577,8 @@ void Output::beginEnumScalar() { bool Output::matchEnumScalar(const char *Str, bool Match) { if (Match && !EnumerationMatchFound) { - this->newLineCheck(); - this->outputUpToEndOfLine(Str); + newLineCheck(); + outputUpToEndOfLine(Str); EnumerationMatchFound = true; } return false; @@ -597,7 +597,7 @@ void Output::endEnumScalar() { } bool Output::beginBitSetScalar(bool &DoClear) { - this->newLineCheck(); + newLineCheck(); output("[ "); NeedBitValueComma = false; DoClear = false; @@ -608,27 +608,27 @@ bool 
Output::bitSetMatch(const char *Str, bool Matches) { if (Matches) { if (NeedBitValueComma) output(", "); - this->output(Str); + output(Str); NeedBitValueComma = true; } return false; } void Output::endBitSetScalar() { - this->outputUpToEndOfLine(" ]"); + outputUpToEndOfLine(" ]"); } void Output::scalarString(StringRef &S, QuotingType MustQuote) { - this->newLineCheck(); + newLineCheck(); if (S.empty()) { // Print '' for the empty string because leaving the field empty is not // allowed. - this->outputUpToEndOfLine("''"); + outputUpToEndOfLine("''"); return; } if (MustQuote == QuotingType::None) { // Only quote if we must. - this->outputUpToEndOfLine(S); + outputUpToEndOfLine(S); return; } @@ -645,7 +645,7 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) { // escapes. This is handled in yaml::escape. if (MustQuote == QuotingType::Double) { output(yaml::escape(Base, /* EscapePrintable= */ false)); - this->outputUpToEndOfLine(Quote); + outputUpToEndOfLine(Quote); return; } @@ -659,7 +659,7 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) { ++j; } output(StringRef(&Base[i], j - i)); - this->outputUpToEndOfLine(Quote); // Ending quote. + outputUpToEndOfLine(Quote); // Ending quote. 
} void Output::blockScalarString(StringRef &S) { @@ -702,7 +702,7 @@ void Output::output(StringRef s) { } void Output::outputUpToEndOfLine(StringRef s) { - this->output(s); + output(s); if (StateStack.empty() || (StateStack.back() != inFlowSeq && StateStack.back() != inFlowMapFirstKey && StateStack.back() != inFlowMapOtherKey)) @@ -723,7 +723,7 @@ void Output::newLineCheck() { return; NeedsNewLine = false; - this->outputNewLine(); + outputNewLine(); assert(StateStack.size() > 0); unsigned Indent = StateStack.size() - 1; -- GitLab From 105b05e085580cdd3b9ef95db08e3661b8532232 Mon Sep 17 00:00:00 2001 From: Francis Visoiu Mistrih Date: Wed, 10 Oct 2018 18:43:42 +0000 Subject: [PATCH 0015/1116] Reland: [OptRemarks] Add library for parsing optimization remarks Add a library that parses optimization remarks (currently YAML, so based on the YAMLParser). The goal is to be able to provide tools a remark parser that is not completely dependent on YAML, in case we decide to change the format later. It exposes a C API which takes a handler that is called with the remark structure. It adds a libLLVMOptRemark.a static library, and it's used in-tree by the llvm-opt-report tool (from which the parser has been mostly moved out). Differential Revision: https://reviews.llvm.org/D52776 Fixed the tests by removing the usage of C++11 strings, which seems not to be supported by gcc 4.8.4 if they're used as a macro argument. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344171 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/OptRemarks.h | 197 ++++++++ lib/CMakeLists.txt | 1 + lib/LLVMBuild.txt | 1 + lib/OptRemarks/CMakeLists.txt | 3 + lib/OptRemarks/LLVMBuild.txt | 22 + lib/OptRemarks/OptRemarksParser.cpp | 368 +++++++++++++++ tools/llvm-opt-report/CMakeLists.txt | 2 +- tools/llvm-opt-report/OptReport.cpp | 144 ++---- unittests/CMakeLists.txt | 1 + unittests/OptRemarks/CMakeLists.txt | 8 + .../OptRemarks/OptRemarksParsingTest.cpp | 433 ++++++++++++++++++ 11 files changed, 1073 insertions(+), 107 deletions(-) create mode 100644 include/llvm-c/OptRemarks.h create mode 100644 lib/OptRemarks/CMakeLists.txt create mode 100644 lib/OptRemarks/LLVMBuild.txt create mode 100644 lib/OptRemarks/OptRemarksParser.cpp create mode 100644 unittests/OptRemarks/CMakeLists.txt create mode 100644 unittests/OptRemarks/OptRemarksParsingTest.cpp diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h new file mode 100644 index 00000000000..f3449cc1b8c --- /dev/null +++ b/include/llvm-c/OptRemarks.h @@ -0,0 +1,197 @@ +/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\ +|* *| +|* The LLVM Compiler Infrastructure *| +|* *| +|* This file is distributed under the University of Illinois Open Source *| +|* License. See LICENSE.TXT for details. *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header provides a public interface to an opt-remark library. *| +|* LLVM provides an implementation of this interface. 
*| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_OPT_REMARKS_H +#define LLVM_C_OPT_REMARKS_H + +#include "llvm-c/Core.h" +#include "llvm-c/Types.h" +#ifdef __cplusplus +#include +extern "C" { +#else +#include +#endif /* !defined(__cplusplus) */ + +/** + * @defgroup LLVMCOPTREMARKS OptRemarks + * @ingroup LLVMC + * + * @{ + */ + +#define OPT_REMARKS_API_VERSION 0 + +/** + * String containing a buffer and a length. The buffer is not guaranteed to be + * zero-terminated. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + const char *Str; + uint32_t Len; +} LLVMOptRemarkStringRef; + +/** + * DebugLoc containing File, Line and Column. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + // File: + LLVMOptRemarkStringRef SourceFile; + // Line: + uint32_t SourceLineNumber; + // Column: + uint32_t SourceColumnNumber; +} LLVMOptRemarkDebugLoc; + +/** + * Element of the "Args" list. The key might give more information about what + * are the semantics of the value, e.g. "Callee" will tell you that the value + * is a symbol that names a function. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + // e.g. "Callee" + LLVMOptRemarkStringRef Key; + // e.g. "malloc" + LLVMOptRemarkStringRef Value; + + // "DebugLoc": Optional + LLVMOptRemarkDebugLoc DebugLoc; +} LLVMOptRemarkArg; + +/** + * One remark entry. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +typedef struct { + // e.g. !Missed, !Passed + LLVMOptRemarkStringRef RemarkType; + // "Pass": Required + LLVMOptRemarkStringRef PassName; + // "Name": Required + LLVMOptRemarkStringRef RemarkName; + // "Function": Required + LLVMOptRemarkStringRef FunctionName; + + // "DebugLoc": Optional + LLVMOptRemarkDebugLoc DebugLoc; + // "Hotness": Optional + uint32_t Hotness; + // "Args": Optional. It is an array of `NumArgs` elements. 
+ uint32_t NumArgs; + LLVMOptRemarkArg *Args; +} LLVMOptRemarkEntry; + +typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef; + +/** + * Creates a remark parser that can be used to read and parse the buffer located + * in \p Buf of size \p Size. + * + * \p Buf cannot be NULL. + * + * This function should be paired with LLVMOptRemarkParserDispose() to avoid + * leaking resources. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf, + uint64_t Size); + +/** + * Returns the next remark in the file. + * + * The value pointed to by the return value is invalidated by the next call to + * LLVMOptRemarkParserGetNext(). + * + * If the parser reaches the end of the buffer, the return value will be NULL. + * + * In the case of an error, the return value will be NULL, and: + * + * 1) LLVMOptRemarkParserHasError() will return `1`. + * + * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error + * message. + * + * An error may occur if: + * + * 1) An argument is invalid. + * + * 2) There is a YAML parsing error. This type of error aborts parsing + * immediately and returns `1`. It can occur on malformed YAML. + * + * 3) Remark parsing error. If this type of error occurs, the parser won't call + * the handler and will continue to the next one. It can occur on malformed + * remarks, like missing or extra fields in the file. 
+ * + * Here is a quick example of the usage: + * + * ``` + * LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size); + * LLVMOptRemarkEntry *Remark = NULL; + * while ((Remark = LLVMOptRemarkParserGetNext(Parser))) { + * // use Remark + * } + * bool HasError = LLVMOptRemarkParserHasError(Parser); + * LLVMOptRemarkParserDispose(Parser); + * ``` + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern LLVMOptRemarkEntry * +LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser); + +/** + * Returns `1` if the parser encountered an error while parsing the buffer. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser); + +/** + * Returns a null-terminated string containing an error message. + * + * In case of no error, the result is `NULL`. + * + * The memory of the string is bound to the lifetime of \p Parser. If + * LLVMOptRemarkParserDispose() is called, the memory of the string will be + * released. + * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern const char * +LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser); + +/** + * Releases all the resources used by \p Parser. 
+ * + * \since OPT_REMARKS_API_VERSION=0 + */ +extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser); + +/** + * @} // endgroup LLVMCOPTREMARKS + */ + +#ifdef __cplusplus +} +#endif /* !defined(__cplusplus) */ + +#endif /* LLVM_C_OPT_REMARKS_H */ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index ecf8b93d253..1f54c611bad 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -15,6 +15,7 @@ add_subdirectory(MC) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) +add_subdirectory(OptRemarks) add_subdirectory(DebugInfo) add_subdirectory(ExecutionEngine) add_subdirectory(Target) diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt index a6cd15699fb..0eb4bba2676 100644 --- a/lib/LLVMBuild.txt +++ b/lib/LLVMBuild.txt @@ -35,6 +35,7 @@ subdirectories = BinaryFormat ObjectYAML Option + OptRemarks Passes ProfileData Support diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt new file mode 100644 index 00000000000..8fefe1d986b --- /dev/null +++ b/lib/OptRemarks/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMOptRemarks + OptRemarksParser.cpp +) diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt new file mode 100644 index 00000000000..4c1032296dc --- /dev/null +++ b/lib/OptRemarks/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = OptRemarks +parent = Libraries +required_libraries = Support diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp new file mode 100644 index 00000000000..0478d2bfbfa --- /dev/null +++ b/lib/OptRemarks/OptRemarksParser.cpp @@ -0,0 +1,368 @@ +//===- OptRemarksParser.cpp -----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides utility methods used by clients that want to use the +// parser for optimization remarks in LLVM. +// +//===----------------------------------------------------------------------===// + +#include "llvm-c/OptRemarks.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/YAMLTraits.h" + +using namespace llvm; + +namespace { +struct RemarkParser { + /// Source manager for better error messages. + SourceMgr SM; + /// Stream for yaml parsing. + yaml::Stream Stream; + /// Storage for the error stream. + std::string ErrorString; + /// The error stream. + raw_string_ostream ErrorStream; + /// Iterator in the YAML stream. + yaml::document_iterator DI; + /// The parsed remark (if any). + Optional LastRemark; + /// Temporary parsing buffer for the arguments. + SmallVector TmpArgs; + /// The state used by the parser to parse a remark entry. Invalidated with + /// every call to `parseYAMLElement`. + struct ParseState { + /// Temporary parsing buffer for the arguments. 
+ SmallVectorImpl *Args; + StringRef Type; + StringRef Pass; + StringRef Name; + StringRef Function; + /// Optional. + Optional File; + Optional Line; + Optional Column; + Optional Hotness; + + ParseState(SmallVectorImpl &Args) : Args(&Args) {} + /// Use Args only as a **temporary** buffer. + ~ParseState() { Args->clear(); } + }; + + ParseState State; + + /// Set to `true` if we had any errors during parsing. + bool HadAnyErrors = false; + + RemarkParser(StringRef Buf) + : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString), + DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) { + SM.setDiagHandler(RemarkParser::HandleDiagnostic, this); + } + + /// Parse a YAML element. + Error parseYAMLElement(yaml::Document &Remark); + +private: + /// Parse one key to a string. + /// Returns an error if the key is not a string. + Error parseKey(StringRef &Result, yaml::KeyValueNode &Node); + /// Parse one value to a string. + Error parseValue(StringRef &Result, yaml::KeyValueNode &Node); + /// Parse one value to an unsigned. + Error parseValue(Optional &Result, yaml::KeyValueNode &Node); + /// Parse a debug location. + Error parseDebugLoc(Optional &File, Optional &Line, + Optional &Column, yaml::KeyValueNode &Node); + /// Parse an argument. + Error parseArg(SmallVectorImpl &TmpArgs, yaml::Node &Node); + + /// Handle a diagnostic from the YAML stream. Records the error in the + /// RemarkParser class. 
+ static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) { + assert(Ctx && "Expected non-null Ctx in diagnostic handler."); + auto *Parser = static_cast(Ctx); + Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false, + /*ShowKindLabels*/ true); + } +}; + +class ParseError : public ErrorInfo { +public: + static char ID; + + ParseError(StringRef Message, yaml::Node &Node) + : Message(Message), Node(Node) {} + + void log(raw_ostream &OS) const override { OS << Message; } + std::error_code convertToErrorCode() const override { + return inconvertibleErrorCode(); + } + + StringRef getMessage() const { return Message; } + yaml::Node &getNode() const { return Node; } + +private: + StringRef Message; // No need to hold a full copy of the buffer. + yaml::Node &Node; +}; + +char ParseError::ID = 0; + +static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) { + return {Str.data(), static_cast(Str.size())}; +} + +Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) { + auto *Key = dyn_cast(Node.getKey()); + if (!Key) + return make_error("key is not a string.", Node); + + Result = Key->getRawValue(); + return Error::success(); +} + +Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) { + auto *Value = dyn_cast(Node.getValue()); + if (!Value) + return make_error("expected a value of scalar type.", Node); + Result = Value->getRawValue(); + + if (Result.front() == '\'') + Result = Result.drop_front(); + + if (Result.back() == '\'') + Result = Result.drop_back(); + + return Error::success(); +} + +Error RemarkParser::parseValue(Optional &Result, + yaml::KeyValueNode &Node) { + SmallVector Tmp; + auto *Value = dyn_cast(Node.getValue()); + if (!Value) + return make_error("expected a value of scalar type.", Node); + unsigned UnsignedValue = 0; + if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue)) + return make_error("expected a value of integer type.", *Value); + Result = UnsignedValue; + return 
Error::success(); +} + +Error RemarkParser::parseDebugLoc(Optional &File, + Optional &Line, + Optional &Column, + yaml::KeyValueNode &Node) { + auto *DebugLoc = dyn_cast(Node.getValue()); + if (!DebugLoc) + return make_error("expected a value of mapping type.", Node); + + for (yaml::KeyValueNode &DLNode : *DebugLoc) { + StringRef KeyName; + if (Error E = parseKey(KeyName, DLNode)) + return E; + if (KeyName == "File") { + File = StringRef(); // Set the optional to contain a default constructed + // value, to be passed to the parsing function. + if (Error E = parseValue(*File, DLNode)) + return E; + } else if (KeyName == "Column") { + if (Error E = parseValue(Column, DLNode)) + return E; + } else if (KeyName == "Line") { + if (Error E = parseValue(Line, DLNode)) + return E; + } else { + return make_error("unknown entry in DebugLoc map.", DLNode); + } + } + + // If any of the debug loc fields is missing, return an error. + if (!File || !Line || !Column) + return make_error("DebugLoc node incomplete.", Node); + + return Error::success(); +} + +Error RemarkParser::parseArg(SmallVectorImpl &Args, + yaml::Node &Node) { + auto *ArgMap = dyn_cast(&Node); + if (!ArgMap) + return make_error("expected a value of mapping type.", Node); + + StringRef ValueStr; + StringRef KeyStr; + Optional File; + Optional Line; + Optional Column; + + for (yaml::KeyValueNode &ArgEntry : *ArgMap) { + StringRef KeyName; + if (Error E = parseKey(KeyName, ArgEntry)) + return E; + + // Try to parse debug locs. + if (KeyName == "DebugLoc") { + // Can't have multiple DebugLoc entries per argument. + if (File || Line || Column) + return make_error( + "only one DebugLoc entry is allowed per argument.", ArgEntry); + + if (Error E = parseDebugLoc(File, Line, Column, ArgEntry)) + return E; + continue; + } + + // If we already have a string, error out. + if (!ValueStr.empty()) + return make_error( + "only one string entry is allowed per argument.", ArgEntry); + + // Try to parse a string. 
+ if (Error E = parseValue(ValueStr, ArgEntry)) + return E; + + // Keep the key from the string. + KeyStr = KeyName; + } + + if (KeyStr.empty()) + return make_error("argument key is missing.", *ArgMap); + if (ValueStr.empty()) + return make_error("argument value is missing.", *ArgMap); + + Args.push_back(LLVMOptRemarkArg{ + toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr), + LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())), + Line.getValueOr(0), Column.getValueOr(0)}}); + + return Error::success(); +} + +Error RemarkParser::parseYAMLElement(yaml::Document &Remark) { + // Parsing a new remark, clear the previous one. + LastRemark = None; + State = ParseState(TmpArgs); + + auto *Root = dyn_cast(Remark.getRoot()); + if (!Root) + return make_error("document root is not of mapping type.", + *Remark.getRoot()); + + State.Type = Root->getRawTag(); + + for (yaml::KeyValueNode &RemarkField : *Root) { + StringRef KeyName; + if (Error E = parseKey(KeyName, RemarkField)) + return E; + + if (KeyName == "Pass") { + if (Error E = parseValue(State.Pass, RemarkField)) + return E; + } else if (KeyName == "Name") { + if (Error E = parseValue(State.Name, RemarkField)) + return E; + } else if (KeyName == "Function") { + if (Error E = parseValue(State.Function, RemarkField)) + return E; + } else if (KeyName == "Hotness") { + if (Error E = parseValue(State.Hotness, RemarkField)) + return E; + } else if (KeyName == "DebugLoc") { + if (Error E = + parseDebugLoc(State.File, State.Line, State.Column, RemarkField)) + return E; + } else if (KeyName == "Args") { + auto *Args = dyn_cast(RemarkField.getValue()); + if (!Args) + return make_error("wrong value type for key.", RemarkField); + + for (yaml::Node &Arg : *Args) + if (Error E = parseArg(*State.Args, Arg)) + return E; + } else { + return make_error("unknown key.", RemarkField); + } + } + + // If the YAML parsing failed, don't even continue parsing. We might + // encounter malformed YAML. 
+ if (Stream.failed()) + return make_error("YAML parsing failed.", *Remark.getRoot()); + + // Check if any of the mandatory fields are missing. + if (State.Type.empty() || State.Pass.empty() || State.Name.empty() || + State.Function.empty()) + return make_error("Type, Pass, Name or Function missing.", + *Remark.getRoot()); + + LastRemark = LLVMOptRemarkEntry{ + toOptRemarkStr(State.Type), + toOptRemarkStr(State.Pass), + toOptRemarkStr(State.Name), + toOptRemarkStr(State.Function), + LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())), + State.Line.getValueOr(0), + State.Column.getValueOr(0)}, + State.Hotness.getValueOr(0), + static_cast(State.Args->size()), + State.Args->data()}; + + return Error::success(); +} +} // namespace + +// Create wrappers for C Binding types (see CBindingWrapping.h). +DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef) + +extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf, + uint64_t Size) { + return wrap( + new RemarkParser(StringRef(static_cast(Buf), Size))); +} + +extern "C" LLVMOptRemarkEntry * +LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) { + RemarkParser &TheParser = *unwrap(Parser); + // Check for EOF. + if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end()) + return nullptr; + + // Try to parse an entry. + if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) { + handleAllErrors(std::move(E), [&](const ParseError &PE) { + TheParser.Stream.printError(&PE.getNode(), + Twine(PE.getMessage()) + Twine('\n')); + TheParser.HadAnyErrors = true; + }); + return nullptr; + } + + // Move on. + ++TheParser.DI; + + // Return the just-parsed remark. 
+ if (Optional &Entry = TheParser.LastRemark) + return &*Entry; + return nullptr; +} + +extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) { + return unwrap(Parser)->HadAnyErrors; +} + +extern "C" const char * +LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) { + return unwrap(Parser)->ErrorStream.str().c_str(); +} + +extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) { + delete unwrap(Parser); +} diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt index 777537a54c0..3aabc03ab3f 100644 --- a/tools/llvm-opt-report/CMakeLists.txt +++ b/tools/llvm-opt-report/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS Core Demangle Object Support) +set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support) add_llvm_tool(llvm-opt-report OptReport.cpp diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp index aa7966132c2..071f779a9e6 100644 --- a/tools/llvm-opt-report/OptReport.cpp +++ b/tools/llvm-opt-report/OptReport.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/WithColor.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm-c/OptRemarks.h" #include #include #include @@ -142,104 +143,44 @@ typedef std::map>>> LocationInfoTy; } // anonymous namespace -static void collectLocationInfo(yaml::Stream &Stream, - LocationInfoTy &LocationInfo) { - SmallVector Tmp; - - // Note: We're using the YAML parser here directly, instead of using the - // YAMLTraits implementation, because the YAMLTraits implementation does not - // support a way to handle only a subset of the input keys (it will error out - // if there is an input key that you don't map to your class), and - // furthermore, it does not provide a way to handle the Args sequence of - // key/value pairs, where the order must be captured and the 'String' key - // might be repeated. 
- for (auto &Doc : Stream) { - auto *Root = dyn_cast(Doc.getRoot()); - if (!Root) - continue; +static bool readLocationInfo(LocationInfoTy &LocationInfo) { + ErrorOr> Buf = + MemoryBuffer::getFile(InputFileName.c_str()); + if (std::error_code EC = Buf.getError()) { + WithColor::error() << "Can't open file " << InputFileName << ": " + << EC.message() << "\n"; + return false; + } - bool Transformed = Root->getRawTag() == "!Passed"; - std::string Pass, File, Function; - int Line = 0, Column = 1; + StringRef Buffer = (*Buf)->getBuffer(); + LLVMOptRemarkParserRef Parser = + LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size()); + + LLVMOptRemarkEntry *Remark = nullptr; + while ((Remark = LLVMOptRemarkParserGetNext(Parser))) { + bool Transformed = + StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed"; + StringRef Pass(Remark->PassName.Str, Remark->PassName.Len); + StringRef File(Remark->DebugLoc.SourceFile.Str, + Remark->DebugLoc.SourceFile.Len); + StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len); + uint32_t Line = Remark->DebugLoc.SourceLineNumber; + uint32_t Column = Remark->DebugLoc.SourceColumnNumber; + ArrayRef Args(Remark->Args, Remark->NumArgs); int VectorizationFactor = 1; int InterleaveCount = 1; int UnrollCount = 1; - for (auto &RootChild : *Root) { - auto *Key = dyn_cast(RootChild.getKey()); - if (!Key) - continue; - StringRef KeyName = Key->getValue(Tmp); - if (KeyName == "Pass") { - auto *Value = dyn_cast(RootChild.getValue()); - if (!Value) - continue; - Pass = Value->getValue(Tmp); - } else if (KeyName == "Function") { - auto *Value = dyn_cast(RootChild.getValue()); - if (!Value) - continue; - Function = Value->getValue(Tmp); - } else if (KeyName == "DebugLoc") { - auto *DebugLoc = dyn_cast(RootChild.getValue()); - if (!DebugLoc) - continue; - - for (auto &DLChild : *DebugLoc) { - auto *DLKey = dyn_cast(DLChild.getKey()); - if (!DLKey) - continue; - StringRef DLKeyName = DLKey->getValue(Tmp); - if (DLKeyName == 
"File") { - auto *Value = dyn_cast(DLChild.getValue()); - if (!Value) - continue; - File = Value->getValue(Tmp); - } else if (DLKeyName == "Line") { - auto *Value = dyn_cast(DLChild.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, Line); - } else if (DLKeyName == "Column") { - auto *Value = dyn_cast(DLChild.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, Column); - } - } - } else if (KeyName == "Args") { - auto *Args = dyn_cast(RootChild.getValue()); - if (!Args) - continue; - for (auto &ArgChild : *Args) { - auto *ArgMap = dyn_cast(&ArgChild); - if (!ArgMap) - continue; - for (auto &ArgKV : *ArgMap) { - auto *ArgKey = dyn_cast(ArgKV.getKey()); - if (!ArgKey) - continue; - StringRef ArgKeyName = ArgKey->getValue(Tmp); - if (ArgKeyName == "VectorizationFactor") { - auto *Value = dyn_cast(ArgKV.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, VectorizationFactor); - } else if (ArgKeyName == "InterleaveCount") { - auto *Value = dyn_cast(ArgKV.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, InterleaveCount); - } else if (ArgKeyName == "UnrollCount") { - auto *Value = dyn_cast(ArgKV.getValue()); - if (!Value) - continue; - Value->getValue(Tmp).getAsInteger(10, UnrollCount); - } - } - } - } + for (const LLVMOptRemarkArg &Arg : Args) { + StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len); + StringRef ArgValue(Arg.Value.Str, Arg.Value.Len); + if (ArgKeyName == "VectorizationFactor") + ArgValue.getAsInteger(10, VectorizationFactor); + else if (ArgKeyName == "InterleaveCount") + ArgValue.getAsInteger(10, InterleaveCount); + else if (ArgKeyName == "UnrollCount") + ArgValue.getAsInteger(10, UnrollCount); } if (Line < 1 || File.empty()) @@ -268,22 +209,13 @@ static void collectLocationInfo(yaml::Stream &Stream, UpdateLLII(LI.Vectorized); } } -} - -static bool readLocationInfo(LocationInfoTy &LocationInfo) { - ErrorOr> Buf = - 
MemoryBuffer::getFileOrSTDIN(InputFileName); - if (std::error_code EC = Buf.getError()) { - WithColor::error() << "Can't open file " << InputFileName << ": " - << EC.message() << "\n"; - return false; - } - SourceMgr SM; - yaml::Stream Stream(Buf.get()->getBuffer(), SM); - collectLocationInfo(Stream, LocationInfo); + bool HasError = LLVMOptRemarkParserHasError(Parser); + if (HasError) + WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n"; - return true; + LLVMOptRemarkParserDispose(Parser); + return !HasError; } static bool writeReport(LocationInfoTy &LocationInfo) { diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index bc41ab66a23..5dba2de4a88 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -26,6 +26,7 @@ add_subdirectory(MI) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) +add_subdirectory(OptRemarks) add_subdirectory(Passes) add_subdirectory(ProfileData) add_subdirectory(Support) diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt new file mode 100644 index 00000000000..94c74867cc4 --- /dev/null +++ b/unittests/OptRemarks/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_LINK_COMPONENTS + OptRemarks + Support + ) + +add_llvm_unittest(OptRemarksTests + OptRemarksParsingTest.cpp + ) diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp new file mode 100644 index 00000000000..a3b28f038b5 --- /dev/null +++ b/unittests/OptRemarks/OptRemarksParsingTest.cpp @@ -0,0 +1,433 @@ +//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm-c/OptRemarks.h" +#include "gtest/gtest.h" + +using namespace llvm; + +template bool tryParse(const char (&Buf)[N]) { + LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1); + LLVMOptRemarkEntry *Remark = nullptr; + while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) { + EXPECT_TRUE(Remark == nullptr); // Only one remark per test. + Remark = NewRemark; + } + EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test. + bool HasError = LLVMOptRemarkParserHasError(Parser); + LLVMOptRemarkParserDispose(Parser); + return !HasError; +} + +template +bool parseExpectError(const char (&Buf)[N], const char *Error) { + LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1); + LLVMOptRemarkEntry *Remark = nullptr; + while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) { + EXPECT_FALSE(NewRemark); + } + EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark. + EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser)); + bool MatchesError = + StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error); + LLVMOptRemarkParserDispose(Parser); + + return MatchesError; +} + +TEST(OptRemarks, OptRemarksParsingEmpty) { + StringRef Buf = "\n" + "\n"; + LLVMOptRemarkParserRef Parser = + LLVMOptRemarkParserCreate(Buf.data(), Buf.size()); + LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser); + EXPECT_TRUE(NewRemark == nullptr); // No remark expected. 
+ EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser)); + EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)) + .contains("document root is not of mapping type.")); + LLVMOptRemarkParserDispose(Parser); +} + +TEST(OptRemarks, OptRemarksParsingGood) { + EXPECT_TRUE(tryParse("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n" +"Function: foo\n" +"Args:\n" +" - Callee: bar\n" +" - String: ' will not be inlined into '\n" +" - Caller: foo\n" +" DebugLoc: { File: file.c, Line: 2, Column: 0 }\n" +" - String: ' because its definition is unavailable'\n" +"")); + + // No debug loc should also pass. + EXPECT_TRUE(tryParse("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"Args:\n" +" - Callee: bar\n" +" - String: ' will not be inlined into '\n" +" - Caller: foo\n" +" DebugLoc: { File: file.c, Line: 2, Column: 0 }\n" +" - String: ' because its definition is unavailable'\n" +"")); + + // No args is also ok. + EXPECT_TRUE(tryParse("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n" +"Function: foo\n" +"")); + + // Different order. + EXPECT_TRUE(tryParse("\n" +"--- !Missed\n" +"DebugLoc: { Line: 3, Column: 12, File: file.c }\n" +"Function: foo\n" +"Name: NoDefinition\n" +"Args:\n" +" - Callee: bar\n" +" - String: ' will not be inlined into '\n" +" - Caller: foo\n" +" DebugLoc: { File: file.c, Line: 2, Column: 0 }\n" +" - String: ' because its definition is unavailable'\n" +"Pass: inline\n" +"")); +} + +// Mandatory common part of a remark. +#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n\n" +// Test all the types. 
+TEST(OptRemarks, OptRemarksParsingTypes) { + // Type: Passed + EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK)); + // Type: Missed + EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK)); + // Type: Analysis + EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK)); + // Type: AnalysisFPCompute + EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK)); + // Type: AnalysisAliasing + EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK)); + // Type: Failure + EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK)); +} +#undef COMMON_REMARK + +TEST(OptRemarks, OptRemarksParsingMissingFields) { + // No type. + EXPECT_TRUE(parseExpectError("\n" +"---\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"", + "error: Type, Pass, Name or Function missing.")); + // No pass. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Name: NoDefinition\n" +"Function: foo\n" +"", + "error: Type, Pass, Name or Function missing.")); + // No name. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Function: foo\n" +"", + "error: Type, Pass, Name or Function missing.")); + // No function. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"", + "error: Type, Pass, Name or Function missing.")); + // Debug loc but no file. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { Line: 3, Column: 12 }\n" +"", + "DebugLoc node incomplete.")); + // Debug loc but no line. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Column: 12 }\n" +"", + "DebugLoc node incomplete.")); + // Debug loc but no column. 
+ EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Line: 3 }\n" +"", + "DebugLoc node incomplete.")); +} + +TEST(OptRemarks, OptRemarksParsingWrongTypes) { + // Wrong debug loc type. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: foo\n" +"", + "expected a value of mapping type.")); + // Wrong line type. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Line: b, Column: 12 }\n" +"", + "expected a value of integer type.")); + // Wrong column type. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Line: 3, Column: c }\n" +"", + "expected a value of integer type.")); + // Wrong args type. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"Args: foo\n" +"", + "wrong value type for key.")); + // Wrong key type. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"{ A: a }: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"", + "key is not a string.")); + // Debug loc with unknown entry. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Column: 12, Unknown: 12 }\n" +"", + "unknown entry in DebugLoc map.")); + // Unknown entry. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Unknown: inline\n" +"", + "unknown key.")); + // Not a scalar. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: { File: a, Line: 1, Column: 2 }\n" +"Name: NoDefinition\n" +"Function: foo\n" +"", + "expected a value of scalar type.")); + // Not a string file in debug loc. 
+ EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: { a: b }, Column: 12, Line: 12 }\n" +"", + "expected a value of scalar type.")); + // Not an integer column in debug loc. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Column: { a: b }, Line: 12 }\n" +"", + "expected a value of scalar type.")); + // Not an integer line in debug loc. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Column: 12, Line: { a: b } }\n" +"", + "expected a value of scalar type.")); + // Non-scalar line in debug loc again (NOTE: repeats the case above; no Args mapping is exercised here). + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"DebugLoc: { File: file.c, Column: 12, Line: { a: b } }\n" +"", + "expected a value of scalar type.")); +} + +TEST(OptRemarks, OptRemarksParsingWrongArgs) { + // Multiple debug locs per arg. + EXPECT_TRUE( + parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"Args:\n" +" - Str: string\n" +" DebugLoc: { File: a, Line: 1, Column: 2 }\n" +" DebugLoc: { File: a, Line: 1, Column: 2 }\n" +"", + "only one DebugLoc entry is allowed per argument.")); + // Multiple strings per arg. + EXPECT_TRUE( + parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"Args:\n" +" - Str: string\n" +" Str2: string\n" +" DebugLoc: { File: a, Line: 1, Column: 2 }\n" +"", + "only one string entry is allowed per argument.")); + // No arg value. + EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"Args:\n" +" - Callee: ''\n" +" - DebugLoc: { File: a, Line: 1, Column: 2 }\n" +"", + "argument value is missing.")); + // No arg key. 
+ EXPECT_TRUE(parseExpectError("\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"Function: foo\n" +"Args:\n" +" - DebugLoc: { File: a, Line: 1, Column: 2 }\n" +"", + "argument key is missing.")); + +} + +TEST(OptRemarks, OptRemarksGoodStruct) { + StringRef Buf = "\n" +"--- !Missed\n" +"Pass: inline\n" +"Name: NoDefinition\n" +"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n" +"Function: foo\n" +"Args:\n" +" - Callee: bar\n" +" - String: ' will not be inlined into '\n" +" - Caller: foo\n" +" DebugLoc: { File: file.c, Line: 2, Column: 0 }\n" +" - String: ' because its definition is unavailable'\n" +"\n"; + + LLVMOptRemarkParserRef Parser = + LLVMOptRemarkParserCreate(Buf.data(), Buf.size()); + LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser); + EXPECT_FALSE(Remark == nullptr); + EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed"); + EXPECT_EQ(Remark->RemarkType.Len, 7U); + EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline"); + EXPECT_EQ(Remark->PassName.Len, 6U); + EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition"); + EXPECT_EQ(Remark->RemarkName.Len, 12U); + EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo"); + EXPECT_EQ(Remark->FunctionName.Len, 3U); + EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c"); + EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U); + EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U); + EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U); + EXPECT_EQ(Remark->Hotness, 0U); + EXPECT_EQ(Remark->NumArgs, 4U); + // Arg 0 + { + LLVMOptRemarkArg &Arg = Remark->Args[0]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar"); + EXPECT_EQ(Arg.Value.Len, 3U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + // Arg 1 + { + LLVMOptRemarkArg &Arg = 
Remark->Args[1]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into "); + EXPECT_EQ(Arg.Value.Len, 26U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + // Arg 2 + { + LLVMOptRemarkArg &Arg = Remark->Args[2]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo"); + EXPECT_EQ(Arg.Value.Len, 3U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c"); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + // Arg 3 + { + LLVMOptRemarkArg &Arg = Remark->Args[3]; + EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String"); + EXPECT_EQ(Arg.Key.Len, 6U); + EXPECT_EQ(StringRef(Arg.Value.Str, 38), + " because its definition is unavailable"); + EXPECT_EQ(Arg.Value.Len, 38U); + EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), ""); + EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U); + EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U); + } + + EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr); + + EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser)); + LLVMOptRemarkParserDispose(Parser); +} -- GitLab From ca0c32a3b8c810d7e315661a48b070c5ac2fa150 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Wed, 10 Oct 2018 18:49:49 +0000 Subject: [PATCH 0016/1116] [LV] Add a new reduction pattern match Adding a new reduction pattern match for vectorizing code similar to TSVC s3111: for (int i = 0; i < N; i++) if (a[i] > b) sum += a[i]; This patch adds support for fadd, fsub and fmull, as well as multiple branches and different (but compatible) instructions (ex. add+sub) in different branches. 
I have forwarded to trunk, added fsub and fmul functionality and additional tests, but the credit goes to Takahiro, who did most of the actual work. Differential Revision: https://reviews.llvm.org/D49168 Patch by Takahiro Miyoshi. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344172 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/IVDescriptors.h | 7 +- lib/Analysis/IVDescriptors.cpp | 71 +- test/Transforms/LoopVectorize/if-reduction.ll | 666 ++++++++++++++++++ 3 files changed, 737 insertions(+), 7 deletions(-) create mode 100644 test/Transforms/LoopVectorize/if-reduction.ll diff --git a/include/llvm/Analysis/IVDescriptors.h b/include/llvm/Analysis/IVDescriptors.h index d1d7e5ef022..64b4ae23cc5 100644 --- a/include/llvm/Analysis/IVDescriptors.h +++ b/include/llvm/Analysis/IVDescriptors.h @@ -140,7 +140,8 @@ public: /// Returns true if instruction I has multiple uses in Insts static bool hasMultipleUsesOf(Instruction *I, - SmallPtrSetImpl<Instruction *> &Insts); + SmallPtrSetImpl<Instruction *> &Insts, + unsigned MaxNumUses); /// Returns true if all uses of the instruction I is within the Set. static bool areAllUsesIn(Instruction *I, SmallPtrSetImpl<Instruction *> &Set); @@ -150,6 +151,10 @@ public: /// or max(X, Y). static InstDesc isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev); + /// Returns a struct describing if the instruction is a + /// Select(FCmp(X, Y), (Z = X op PHINode), PHINode) instruction pattern. + static InstDesc isConditionalRdxPattern(RecurrenceKind Kind, Instruction *I); + /// Returns identity corresponding to the RecurrenceKind. 
static Constant *getRecurrenceIdentity(RecurrenceKind K, Type *Tp); diff --git a/lib/Analysis/IVDescriptors.cpp b/lib/Analysis/IVDescriptors.cpp index 854a95573e9..47bddf68f49 100644 --- a/lib/Analysis/IVDescriptors.cpp +++ b/lib/Analysis/IVDescriptors.cpp @@ -299,9 +299,17 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, return false; } + bool IsASelect = isa(Cur); + + // A conditional reduction operation must only have 2 or less uses in + // VisitedInsts. + if (IsASelect && (Kind == RK_FloatAdd || Kind == RK_FloatMult) && + hasMultipleUsesOf(Cur, VisitedInsts, 2)) + return false; + // A reduction operation must only have one use of the reduction value. - if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax && - hasMultipleUsesOf(Cur, VisitedInsts)) + if (!IsAPhi && !IsASelect && Kind != RK_IntegerMinMax && + Kind != RK_FloatMinMax && hasMultipleUsesOf(Cur, VisitedInsts, 1)) return false; // All inputs to a PHI node must be a reduction value. @@ -362,7 +370,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind, } else if (!isa(UI) && ((!isa(UI) && !isa(UI) && !isa(UI)) || - !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence())) + (!isConditionalRdxPattern(Kind, UI).isRecurrence() && + !isMinMaxSelectCmpPattern(UI, IgnoredVal).isRecurrence()))) return false; // Remember that we completed the cycle. @@ -491,6 +500,52 @@ RecurrenceDescriptor::isMinMaxSelectCmpPattern(Instruction *I, InstDesc &Prev) { return InstDesc(false, I); } +/// Returns true if the select instruction has users in the compare-and-add +/// reduction pattern below. The select instruction argument is the last one +/// in the sequence. +/// +/// %sum.1 = phi ... +/// ... 
+/// %cmp = fcmp pred %0, %CFP +/// %add = fadd %0, %sum.1 +/// %sum.2 = select %cmp, %add, %sum.1 +RecurrenceDescriptor::InstDesc +RecurrenceDescriptor::isConditionalRdxPattern( + RecurrenceKind Kind, Instruction *I) { + SelectInst *SI = dyn_cast(I); + if (!SI) + return InstDesc(false, I); + + CmpInst *CI = dyn_cast(SI->getCondition()); + // Only handle single use cases for now. + if (!CI || !CI->hasOneUse()) + return InstDesc(false, I); + + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + // Handle only when either of operands of select instruction is a PHI + // node for now. + if ((isa(*TrueVal) && isa(*FalseVal)) || + (!isa(*TrueVal) && !isa(*FalseVal))) + return InstDesc(false, I); + + Instruction *I1 = + isa(*TrueVal) ? dyn_cast(FalseVal) + : dyn_cast(TrueVal); + if (!I1 || !I1->isBinaryOp()) + return InstDesc(false, I); + + Value *Op1, *Op2; + if (m_FAdd(m_Value(Op1), m_Value(Op2)).match(I1) || + m_FSub(m_Value(Op1), m_Value(Op2)).match(I1)) + return InstDesc(Kind == RK_FloatAdd, SI); + + if (m_FMul(m_Value(Op1), m_Value(Op2)).match(I1)) + return InstDesc(Kind == RK_FloatMult, SI); + + return InstDesc(false, I); +} + RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, InstDesc &Prev, bool HasFunNoNaNAttr) { @@ -520,9 +575,12 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, case Instruction::FSub: case Instruction::FAdd: return InstDesc(Kind == RK_FloatAdd, I, UAI); + case Instruction::Select: + if (Kind == RK_FloatAdd || Kind == RK_FloatMult) + return isConditionalRdxPattern(Kind, I); + LLVM_FALLTHROUGH; case Instruction::FCmp: case Instruction::ICmp: - case Instruction::Select: if (Kind != RK_IntegerMinMax && (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) return InstDesc(false, I); @@ -531,13 +589,14 @@ RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind, } bool RecurrenceDescriptor::hasMultipleUsesOf( - Instruction 
*I, SmallPtrSetImpl &Insts) { + Instruction *I, SmallPtrSetImpl &Insts, + unsigned MaxNumUses) { unsigned NumUses = 0; for (User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) { if (Insts.count(dyn_cast(*Use))) ++NumUses; - if (NumUses > 1) + if (NumUses > MaxNumUses) return true; } diff --git a/test/Transforms/LoopVectorize/if-reduction.ll b/test/Transforms/LoopVectorize/if-reduction.ll new file mode 100644 index 00000000000..dd9a6118337 --- /dev/null +++ b/test/Transforms/LoopVectorize/if-reduction.ll @@ -0,0 +1,666 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + +; Float pattern: +; Check vectorization of reduction code which has an fadd instruction after +; an fcmp instruction which compares an array element and 0. +; +; float fcmp_0_fadd_select1(float * restrict x, const int N) { +; float sum = 0. +; for (int i = 0; i < N; ++i) +; if (x[i] > (float)0.) 
+; sum += x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_0_fadd_select1( +; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer +; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]] +; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] +define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %header, %for.body + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp.2 = fcmp fast ogt float %0, 0.000000e+00 + %add = fadd fast float %0, %sum.1 + %sum.2 = select i1 %cmp.2, float %add, float %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret float %1 +} + +; Double pattern: +; Check vectorization of reduction code which has an fadd instruction after +; an fcmp instruction which compares an array element and 0. +; +; double fcmp_0_fadd_select2(double * restrict x, const int N) { +; double sum = 0. +; for (int i = 0; i < N; ++i) +; if (x[i] > 0.) 
+; sum += x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_0_fadd_select2( +; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer +; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]] +; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] +define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %header, %for.body + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 4 + %cmp.2 = fcmp fast ogt double %0, 0.000000e+00 + %add = fadd fast double %0, %sum.1 + %sum.2 = select i1 %cmp.2, double %add, double %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret double %1 +} + +; Float pattern: +; Check vectorization of reduction code which has an fadd instruction after +; an fcmp instruction which compares an array element and a floating-point +; value. +; +; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) { +; float sum = 0. 
+; for (int i = 0; i < N; ++i) +; if (x[i] > y) +; sum += x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_val_fadd_select1( +; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2 +; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]] +; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] +define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %header, %for.body + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp.2 = fcmp fast ogt float %0, %y + %add = fadd fast float %0, %sum.1 + %sum.2 = select i1 %cmp.2, float %add, float %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret float %1 +} + +; Double pattern: +; Check vectorization of reduction code which has an fadd instruction after +; an fcmp instruction which compares an array element and a floating-point +; value. +; +; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) { +; double sum = 0. 
+; for (int i = 0; i < N; ++i) +; if (x[i] > y) +; sum += x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_val_fadd_select2( +; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2 +; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]] +; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] +define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %header, %for.body + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 4 + %cmp.2 = fcmp fast ogt double %0, %y + %add = fadd fast double %0, %sum.1 + %sum.2 = select i1 %cmp.2, double %add, double %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret double %1 +} + +; Float pattern: +; Check vectorization of reduction code which has an fadd instruction after +; an fcmp instruction which compares an array element and another array +; element. +; +; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y, +; const int N) { +; float sum = 0. 
+; for (int i = 0; i < N; ++i) +; if (x[i] > y[i]) +; sum += x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_array_elm_fadd_select1( +; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]] +; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]] +; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]] +define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.header + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv + %0 = load float, float* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv + %1 = load float, float* %arrayidx.2, align 4 + %cmp.2 = fcmp fast ogt float %0, %1 + %add = fadd fast float %0, %sum.1 + %sum.2 = select i1 %cmp.2, float %add, float %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret float %2 +} + +; Double pattern: +; Check vectorization of reduction code which has an fadd instruction after +; an fcmp instruction which compares an array element and another array +; element. +; +; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y, +; const int N) { +; double sum = 0. 
+; for (int i = 0; i < N; ++i) +; if (x[i] > y[i]) +; sum += x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_array_elm_fadd_select2( +; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]] +; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]] +; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]] +define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.header + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv + %0 = load double, double* %arrayidx.1, align 4 + %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv + %1 = load double, double* %arrayidx.2, align 4 + %cmp.2 = fcmp fast ogt double %0, %1 + %add = fadd fast double %0, %sum.1 + %sum.2 = select i1 %cmp.2, double %add, double %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret double %2 +} + +; Float pattern: +; Check vectorization of reduction code which has an fsub instruction after +; an fcmp instruction which compares an array element and 0. +; +; float fcmp_0_fsub_select1(float * restrict x, const int N) { +; float sum = 0. +; for (int i = 0; i < N; ++i) +; if (x[i] > (float)0.) 
+; sum -= x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_0_fsub_select1( +; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer +; CHECK: %[[V3:.*]] = fsub <4 x float> %[[V2:.*]], %[[V0]] +; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] +define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.header + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp.2 = fcmp ogt float %0, 0.000000e+00 + %sub = fsub float %sum.1, %0 + %sum.2 = select i1 %cmp.2, float %sub, float %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret float %1 +} + +; Double pattern: +; Check vectorization of reduction code which has an fsub instruction after +; an fcmp instruction which compares an array element and 0. +; +; double fcmp_0_fsub_select2(double * restrict x, const int N) { +; double sum = 0. +; for (int i = 0; i < N; ++i) +; if (x[i] > 0.) 
+; sum -= x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_0_fsub_select2( +; CHECK: %[[V1:.*]] = fcmp ogt <4 x double> %[[V0:.*]], zeroinitializer +; CHECK: %[[V3:.*]] = fsub <4 x double> %[[V2:.*]], %[[V0]] +; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] +define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.header + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 4 + %cmp.2 = fcmp ogt double %0, 0.000000e+00 + %sub = fsub double %sum.1, %0 + %sum.2 = select i1 %cmp.2, double %sub, double %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret double %1 +} + +; Float pattern: +; Check vectorization of reduction code which has an fmul instruction after +; an fcmp instruction which compares an array element and 0. +; +; float fcmp_0_fmult_select1(float * restrict x, const int N) { +; float sum = 0. +; for (int i = 0; i < N; ++i) +; if (x[i] > (float)0.) 
+; sum *= x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_0_fmult_select1( +; CHECK: %[[V1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], zeroinitializer +; CHECK: %[[V3:.*]] = fmul <4 x float> %[[V2:.*]], %[[V0]] +; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]] +define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.header + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp.2 = fcmp ogt float %0, 0.000000e+00 + %mult = fmul float %sum.1, %0 + %sum.2 = select i1 %cmp.2, float %mult, float %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret float %1 +} + +; Double pattern: +; Check vectorization of reduction code which has an fmul instruction after +; an fcmp instruction which compares an array element and 0. +; +; double fcmp_0_fmult_select2(double * restrict x, const int N) { +; double sum = 0. +; for (int i = 0; i < N; ++i) +; if (x[i] > 0.) 
+; sum *= x[i]; +; return sum; +; } + +; CHECK-LABEL: @fcmp_0_fmult_select2( +; CHECK: %[[V1:.*]] = fcmp ogt <4 x double> %[[V0:.*]], zeroinitializer +; CHECK: %[[V3:.*]] = fmul <4 x double> %[[V2:.*]], %[[V0]] +; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]] +define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly { +entry: + %cmp.1 = icmp sgt i32 %N, 0 + br i1 %cmp.1, label %for.header, label %for.end + +for.header: ; preds = %entry + %zext = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.header + %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ] + %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ] + %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 4 + %cmp.2 = fcmp ogt double %0, 0.000000e+00 + %mult = fmul double %sum.1, %0 + %sum.2 = select i1 %cmp.2, double %mult, double %sum.1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %zext + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ] + ret double %1 +} + +; Float multi pattern +; Check vectorisation of reduction code with a pair of selects to different +; fadd patterns. 
+; +; float fcmp_multi(float *a, int n) { +; float sum=0.0; +; for (int i=0;i1.0) +; sum+=a[i]; +; else if (a[i]<3.0) +; sum+=2*a[i]; +; else +; sum+=3*a[i]; +; } +; return sum; +; } + +; CHECK-LABEL: @fcmp_multi( +; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], %[[V0]], %[[V0]], %[[V0]], %[[C1]], %[[C2]], %[[C11]] +; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], %[[C21]], %[[C11]] +; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]] +; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]] +; CHECK: fadd fast <4 x float> %[[S2]], +define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly { +entry: + %cmp10 = icmp sgt i32 %n, 0 + br i1 %cmp10, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.inc, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+00 + br i1 %cmp1, label %for.inc, label %if.else + +if.else: ; preds = %for.body + %cmp8 = fcmp olt float %0, 3.000000e+00 + br i1 %cmp8, label %if.then10, label %if.else14 + +if.then10: ; preds = %if.else + %mul = fmul fast float %0, 2.000000e+00 + br label %for.inc + +if.else14: ; preds = %if.else + %mul17 = fmul fast float %0, 3.000000e+00 + br label %for.inc + +for.inc: ; preds = %for.body, %if.else14, %if.then10 + %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ] + %sum.1 = fadd fast float %.pn, %sum.011 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = 
%for.inc, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ] + ret float %sum.0.lcssa +} + +; Float fadd + fsub patterns +; Check vectorisation of reduction code with a pair of selects to different +; instructions { fadd, fsub } but equivalent (change in constant). +; +; float fcmp_multi(float *a, int n) { +; float sum=0.0; +; for (int i=0;i1.0) +; sum+=a[i]; +; else if (a[i]<3.0) +; sum-=a[i]; +; } +; return sum; +; } + +; CHECK-LABEL: @fcmp_fadd_fsub( +; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], %[[V0]], +; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float> +; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], %[[C2]], %[[C11]] +; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], %[[C21]], %[[C11]] +; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]] +; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]] +define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.inc, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+00 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %add = fadd fast float %0, %sum.010 + br label %for.inc + +if.else: ; preds = %for.body + %cmp8 = fcmp olt float %0, 3.000000e+00 + br i1 %cmp8, label %if.then10, label %for.inc + +if.then10: ; preds = %if.else + %sub = fsub fast float %sum.010, %0 + br label %for.inc + +for.inc: ; preds = %if.then, %if.then10, %if.else + %sum.1 = phi float [ %add, 
%if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ] + ret float %sum.0.lcssa +} + +; Float fadd + fmul patterns +; Check lack of vectorisation of reduction code with a pair of non-compatible +; instructions { fadd, fmul }. +; +; float fcmp_multi(float *a, int n) { +; float sum=0.0; +; for (int i=0;i1.0) +; sum+=a[i]; +; else if (a[i]<3.0) +; sum*=a[i]; +; } +; return sum; +; } + +; CHECK-LABEL: @fcmp_fadd_fmul( +; CHECK-NOT: <4 x float> +define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.inc, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+00 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %add = fadd fast float %0, %sum.010 + br label %for.inc + +if.else: ; preds = %for.body + %cmp8 = fcmp olt float %0, 3.000000e+00 + br i1 %cmp8, label %if.then10, label %for.inc + +if.then10: ; preds = %if.else + %mul = fmul fast float %0, %sum.010 + br label %for.inc + +for.inc: ; preds = %if.then, %if.then10, %if.else + %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 
%exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ] + ret float %sum.0.lcssa +} + +; Float fadd + store patterns +; Check lack of vectorisation of reduction code with a store back, given it +; has loop dependency on a[i]. +; +; float fcmp_store_back(float a[], int LEN) { +; float sum = 0.0; +; for (int i = 0; i < LEN; i++) { +; sum += a[i]; +; a[i] = sum; +; } +; return sum; +; } + +; CHECK-LABEL: @fcmp_store_back( +; CHECK-NOT: <4 x float> +define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly { +entry: + %cmp7 = icmp sgt i32 %LEN, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %LEN to i64 + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %add = fadd fast float %0, %sum.08 + store float %add, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + ret float %sum.0.lcssa +} -- GitLab From c8b6096ed08df62066b9abfc0e56c734d6530be8 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 10 Oct 2018 19:09:16 +0000 Subject: [PATCH 0017/1116] [WebAssembly][NFC] Use vnot patfrag to simplify v128.not Reviewers: aheejin, dschuff Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D53097 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344175 
91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrSIMD.td | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 28262fbcaf6..491ee56b794 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -508,23 +508,16 @@ defm XOR : SIMDBitwise; } // isCommutable = 1 // Bitwise logic: v128.not -multiclass SIMDNot { - defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), - (outs), (ins), - [(set - (vec_t V128:$dst), - (vec_t (xor - (vec_t V128:$vec), - (vec_t (splat_pat (lane_t -1))) - )) - )], +multiclass SIMDNot { + defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins), + [(set (vec_t V128:$dst), (vec_t (vnot V128:$vec)))], "v128.not\t$dst, $vec", "v128.not", 63>; } -defm "" : SIMDNot; -defm "" : SIMDNot; -defm "" : SIMDNot; -defm "" : SIMDNot; +defm "" : SIMDNot; +defm "" : SIMDNot; +defm "" : SIMDNot; +defm "" : SIMDNot; // Bitwise select: v128.bitselect def wasm_bitselect_t : SDTypeProfile<1, 3, -- GitLab From 4942b853a9cf260fc8159868fc1d0d636a6bead9 Mon Sep 17 00:00:00 2001 From: David Bolvansky Date: Wed, 10 Oct 2018 20:10:37 +0000 Subject: [PATCH 0018/1116] [DwarfVerifier] Fixed -Wimplicit-fallthrough warning Reviewers: JDevlieghere, RKSimon Reviewed By: JDevlieghere Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D52963 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344176 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index c433fe470cb..e78e13bf4af 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -508,14 +508,15 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const 
DWARFDie &Die, "incompatible tag " + TagString(RefTag)); } + break; } case DW_AT_type: { DWARFDie TypeDie = Die.getAttributeValueAsReferencedDie(DW_AT_type); if (TypeDie && !isType(TypeDie.getTag())) { ReportError("DIE has " + AttributeString(Attr) + " with incompatible tag " + TagString(TypeDie.getTag())); - break; } + break; } default: break; -- GitLab From 9f5daa2df05a60699215bed0da0a569531d4e26d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 10 Oct 2018 20:39:39 +0000 Subject: [PATCH 0019/1116] revert r344082: [InstCombine] reverse 'trunc X to ' canonicalization This commit accidentally included the diffs from D53057. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344178 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineCasts.cpp | 31 +- .../InstCombine/InstCombineCompares.cpp | 7 - .../InstCombine/InstCombineVectorOps.cpp | 30 -- test/Transforms/InstCombine/apint-shift.ll | 4 +- .../Transforms/InstCombine/apint-shl-trunc.ll | 5 +- test/Transforms/InstCombine/icmp.ll | 20 +- test/Transforms/InstCombine/vec_shuffle.ll | 5 +- test/Transforms/InstCombine/vector-casts.ll | 19 +- .../LoopVectorize/X86/masked_load_store.ll | 336 +++++++++--------- 9 files changed, 212 insertions(+), 245 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 74f1e695ff6..fd59c3a7c0c 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -706,35 +706,12 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (SimplifyDemandedInstructionBits(CI)) return &CI; + // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector. 
if (DestTy->getScalarSizeInBits() == 1) { + Constant *One = ConstantInt::get(SrcTy, 1); + Src = Builder.CreateAnd(Src, One); Value *Zero = Constant::getNullValue(Src->getType()); - if (DestTy->isIntegerTy()) { - // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only). - // TODO: We canonicalize to more instructions here because we are probably - // lacking equivalent analysis for trunc relative to icmp. There may also - // be codegen concerns. If those trunc limitations were removed, we could - // remove this transform. - Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1)); - return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); - } - - // For vectors, we do not canonicalize all truncs to icmp, so optimize - // patterns that would be covered within visitICmpInst. - Value *X; - const APInt *C; - if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) { - // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0 - APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C); - Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC)); - return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); - } - if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)), - m_Deferred(X))))) { - // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0 - APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1; - Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC)); - return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); - } + return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } // FIXME: Maybe combine the next two transforms to handle the no cast case diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index bf8bc8818f7..07bd98b30ab 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1609,13 +1609,6 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And, Instruction 
*InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And, const APInt &C1) { - // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1 - // TODO: We canonicalize to the longer form for scalars because we have - // better analysis/folds for icmp, and codegen may be better with icmp. - if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() && - C1.isNullValue() && match(And->getOperand(1), m_One())) - return new TruncInst(And->getOperand(0), Cmp.getType()); - const APInt *C2; if (!match(And->getOperand(1), m_APInt(C2))) return nullptr; diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 7258127f319..61a3e31f960 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -1477,33 +1477,6 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf, return SelectInst::Create(NarrowCond, NarrowX, NarrowY); } -/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask. -static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) { - Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); - if (!Shuf.isIdentityWithExtract() || !isa(Op1)) - return nullptr; - - Value *X, *Y; - Constant *Mask; - if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask)))) - return nullptr; - - // We are extracting a subvector from a shuffle. Remove excess elements from - // the 1st shuffle mask to eliminate the extract. - // shuf (shuf X, Y, ), undef, <0, undef, 2> --> - // shuf X, Y, - unsigned NumElts = Shuf.getType()->getVectorNumElements(); - SmallVector NewMask(NumElts); - for (unsigned i = 0; i != NumElts; ++i) { - // If the extracting shuffle has an undef mask element, it transfers to the - // new shuffle mask. Otherwise, copy the original mask element. 
- Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i); - Constant *MaskElt = Mask->getAggregateElement(i); - NewMask[i] = isa(ExtractMaskElt) ? ExtractMaskElt : MaskElt; - } - return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); -} - Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); @@ -1526,9 +1499,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } - if (Instruction *I = foldIdentityExtractShuffle(SVI)) - return I; - SmallVector Mask = SVI.getShuffleMask(); Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll index 3266fa6e443..fc564665a60 100644 --- a/test/Transforms/InstCombine/apint-shift.ll +++ b/test/Transforms/InstCombine/apint-shift.ll @@ -319,8 +319,8 @@ define i1 @test16(i84 %X) { define <2 x i1> @test16vec(<2 x i84> %X) { ; CHECK-LABEL: @test16vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], -; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[AND:%.*]] = and <2 x i84> %X, +; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shr = ashr <2 x i84> %X, diff --git a/test/Transforms/InstCombine/apint-shl-trunc.ll b/test/Transforms/InstCombine/apint-shl-trunc.ll index 2241c88cb6b..c7d7d369592 100644 --- a/test/Transforms/InstCombine/apint-shl-trunc.ll +++ b/test/Transforms/InstCombine/apint-shl-trunc.ll @@ -27,8 +27,9 @@ define i1 @test1(i799 %X, i799 %A) { define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) { ; CHECK-LABEL: @test0vec( -; CHECK-NEXT: [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]] -; CHECK-NEXT: [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i39> , [[A:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] 
= and <2 x i39> [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[D]] ; %B = lshr <2 x i39> %X, %A diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index 1f05bb67e96..1f97009911b 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -2427,9 +2427,10 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) { define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_and_or_lshr_vec( -; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]] -; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1> +; CHECK-NEXT: [[SHF1:%.*]] = shl nuw <2 x i32> , [[Y:%.*]] +; CHECK-NEXT: [[OR2:%.*]] = or <2 x i32> [[SHF1]], +; CHECK-NEXT: [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %shf = lshr <2 x i32> %x, %y @@ -2444,7 +2445,8 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) { ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], ; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]] ; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]] -; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1> +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization @@ -2470,8 +2472,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) { define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 
x i32> [[AND1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %shf = lshr <2 x i32> %x, @@ -2484,8 +2486,10 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) { define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) { ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]] +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll index 8adb211b0a9..e9c3539ef6b 100644 --- a/test/Transforms/InstCombine/vec_shuffle.ll +++ b/test/Transforms/InstCombine/vec_shuffle.ll @@ -174,7 +174,8 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) { define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @extract_subvector_of_shuffle( -; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32> +; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32> ; CHECK-NEXT: ret <2 x i8> [[EXTRACT_SUBV]] ; %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> @@ -193,7 +194,7 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> ; CHECK-NEXT: call void @use_v5i8(<5 x i8> [[SHUF]]) -; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> 
[[X]], <2 x i8> [[Y]], <4 x i32> +; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> ; CHECK-NEXT: ret <4 x i8> [[EXTRACT_SUBV]] ; %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll index e0d6083a969..6e0d66b8ed4 100644 --- a/test/Transforms/InstCombine/vector-casts.ll +++ b/test/Transforms/InstCombine/vector-casts.ll @@ -1,22 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; Can't get smaller than this. +; This turns into a&1 != 0 +; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high. +; This pattern does not appear to meet that standard. define <2 x i1> @trunc(<2 x i64> %a) { ; CHECK-LABEL: @trunc( -; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[T]] ; %t = trunc <2 x i64> %a to <2 x i1> ret <2 x i1> %t } -; This is trunc. +; TODO: This could be just 1 instruction (trunc). define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc( -; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -24,11 +28,12 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ret <2 x i1> %r } -; This is trunc. +; TODO: This could be just 1 instruction (trunc). 
define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( -; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -36,7 +41,7 @@ define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. +; TODO: This could be just 1 instruction (trunc). define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 71038feec7b..8e948639ba1 100644 --- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -2901,45 +2901,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> -; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> -; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> -; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef) -; AVX-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds double*, double** [[TMP12]], i64 4 -; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef) -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer +; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4 ; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef) -; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> 
[[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] -; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] -; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] -; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] -; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 -; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12 
+; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) +; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] +; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] +; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) +; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; 
AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -2949,14 +2953,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null +; AVX-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 
[[INDVARS_IV]] @@ -2994,45 +2998,49 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> -; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> -; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> -; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef) -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef) -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] -; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] -; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] -; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] -; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* 
[[TMP30]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef) +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) +; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] +; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] +; 
AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3042,14 +3050,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias 
%trigg ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null +; AVX512-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3154,45 +3162,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> -; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> -; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> -; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] -; 
AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4 -; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer +; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4 ; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] 
= call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] -; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] -; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] -; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] -; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 -; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** 
[[TMP20]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12 +; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] +; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] +; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) +; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x 
double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3202,14 +3214,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null +; AVX-NEXT: 
[[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3247,45 +3259,49 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> -; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> -; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> -; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 -; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP12:%.*]] = icmp 
ne <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] -; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] -; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] -; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] -; AVX512-NEXT: [[TMP29:%.*]] 
= bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 -; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; 
AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] +; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] +; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP40]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3295,14 +3311,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null +; AVX512-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -- GitLab From 6029ddd2298b5a52ba2cc29e3f4c38f6c7cfab20 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 10 Oct 2018 20:40:12 +0000 Subject: [PATCH 0020/1116] [X86] Move X86DAGToDAGISel::matchBEXTRFromAnd() into X86ISelLowering Summary: As discussed in [[ https://bugs.llvm.org/show_bug.cgi?id=38938 | PR38938 ]], we fail to emit `BEXTR` if the mask is 
shifted. We can't deal with that in `X86DAGToDAGISel` `before the address mode for the inc is selected`, and we can't really do it in the normal DAGCombine, because we don't have generic `ISD::BitFieldExtract` node, and if we simply turn the shifted mask into a normal mask + shift-left, it will be folded back. So it would seem X86ISelLowering is the place to handle this. This patch only moves the matchBEXTRFromAnd() from X86DAGToDAGISel to X86ISelLowering. It does not add support for the 'shifted mask' pattern. Reviewers: RKSimon, craig.topper, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D52426 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344179 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 66 ------------------------------ lib/Target/X86/X86ISelLowering.cpp | 66 ++++++++++++++++++++++++++++++ test/CodeGen/X86/tbm_patterns.ll | 6 +-- 3 files changed, 68 insertions(+), 70 deletions(-) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index be079659da4..25a8567a9c1 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -457,7 +457,6 @@ namespace { } bool foldLoadStoreIntoMemOperand(SDNode *Node); - bool matchBEXTRFromAnd(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); @@ -2582,69 +2581,6 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { return true; } -// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. -bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) { - MVT NVT = Node->getSimpleValueType(0); - SDLoc dl(Node); - - SDValue N0 = Node->getOperand(0); - SDValue N1 = Node->getOperand(1); - - // If we have TBM we can use an immediate for the control. If we have BMI - // we should only do this if the BEXTR instruction is implemented well. 
- // Otherwise moving the control into a register makes this more costly. - // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM - // hoisting the move immediate would make it worthwhile with a less optimal - // BEXTR? - if (!Subtarget->hasTBM() && - !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) - return false; - - // Must have a shift right. - if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) - return false; - - // Shift can't have additional users. - if (!N0->hasOneUse()) - return false; - - // Only supported for 32 and 64 bits. - if (NVT != MVT::i32 && NVT != MVT::i64) - return false; - - // Shift amount and RHS of and must be constant. - ConstantSDNode *MaskCst = dyn_cast(N1); - ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); - if (!MaskCst || !ShiftCst) - return false; - - // And RHS must be a mask. - uint64_t Mask = MaskCst->getZExtValue(); - if (!isMask_64(Mask)) - return false; - - uint64_t Shift = ShiftCst->getZExtValue(); - uint64_t MaskSize = countPopulation(Mask); - - // Don't interfere with something that can be handled by extracting AH. - // TODO: If we are able to fold a load, BEXTR might still be better than AH. - if (Shift == 8 && MaskSize == 8) - return false; - - // Make sure we are only using bits that were in the original value, not - // shifted in. - if (Shift + MaskSize > NVT.getSizeInBits()) - return false; - - // Create a BEXTR node and run it through selection. - SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT); - SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT, - N0->getOperand(0), C); - ReplaceNode(Node, New.getNode()); - SelectCode(New.getNode()); - return true; -} - // Emit a PCMISTR(I/M) instruction. 
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, @@ -2952,8 +2888,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case ISD::AND: - if (matchBEXTRFromAnd(Node)) - return; if (AndImmShrink && shrinkAndImmediate(Node)) return; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 67f98d8ee72..ab9a14a65a1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35278,6 +35278,69 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, return SDValue(); } +static bool hasBEXTR(const X86Subtarget &Subtarget, EVT VT) { + // If we have TBM we can use an immediate for the control. If we have BMI + // we should only do this if the BEXTR instruction is implemented well. + // Otherwise moving the control into a register makes this more costly. + // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM + // hoisting the move immediate would make it worthwhile with a less optimal + // BEXTR? + if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) + return false; + return (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit())); +} + +// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. +static SDValue combineAndIntoBEXTR(SDNode *Node, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + EVT NVT = Node->getValueType(0); + SDLoc dl(Node); + + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + // Check if subtarget has BEXTR instruction for the node's type + if (!hasBEXTR(Subtarget, NVT)) + return SDValue(); + + // Must have a shift right. + if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) + return SDValue(); + + // Shift can't have additional users. + if (!N0->hasOneUse()) + return SDValue(); + + // Shift amount and RHS of and must be constant. 
+ ConstantSDNode *MaskCst = dyn_cast(N1); + ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); + if (!MaskCst || !ShiftCst) + return SDValue(); + + // And RHS must be a mask. + uint64_t Mask = MaskCst->getZExtValue(); + if (!isMask_64(Mask)) + return SDValue(); + + uint64_t Shift = ShiftCst->getZExtValue(); + uint64_t MaskSize = countPopulation(Mask); + + // Don't interfere with something that can be handled by extracting AH. + // TODO: If we are able to fold a load, BEXTR might still be better than AH. + if (Shift == 8 && MaskSize == 8) + return SDValue(); + + // Make sure we are only using bits that were in the original value, not + // shifted in. + if (Shift + MaskSize > NVT.getSizeInBits()) + return SDValue(); + + // Create a BEXTR node. + SDValue C = DAG.getConstant(Shift | (MaskSize << 8), dl, NVT); + SDValue New = DAG.getNode(X86ISD::BEXTR, dl, NVT, N0->getOperand(0), C); + return New; +} + // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity. // Turn it into series of XORs and a setnp. 
static SDValue combineParity(SDNode *N, SelectionDAG &DAG, @@ -35379,6 +35442,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue R = combineAndIntoBEXTR(N, DAG, Subtarget)) + return R; + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll index 6865cc5a0ef..2b335ea4268 100644 --- a/test/CodeGen/X86/tbm_patterns.ll +++ b/test/CodeGen/X86/tbm_patterns.ll @@ -53,8 +53,7 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: shrl $4, %edi -; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF +; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 ; CHECK-NEXT: cmovnel %edx, %eax ; CHECK-NEXT: retq %t0 = lshr i32 %a, 4 @@ -114,8 +113,7 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind { ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: shrl $4, %edi -; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF +; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 ; CHECK-NEXT: cmovneq %rdx, %rax ; CHECK-NEXT: retq %t0 = lshr i64 %a, 4 -- GitLab From 2870bb0615585488fd166f9a625b655452be91e4 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 10 Oct 2018 20:40:54 +0000 Subject: [PATCH 0021/1116] [WebAssembly][NFC] Remove repetition of Defs = [ARGUMENTS] Summary: By moving that line into the `I` multiclass. 
Reviewers: aheejin Subscribers: dschuff, sbc100, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D53093 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344180 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrAtomics.td | 16 -------------- .../WebAssembly/WebAssemblyInstrCall.td | 4 ---- .../WebAssembly/WebAssemblyInstrControl.td | 8 ------- .../WebAssembly/WebAssemblyInstrConv.td | 8 ------- .../WebAssembly/WebAssemblyInstrExceptRef.td | 4 ---- .../WebAssembly/WebAssemblyInstrFloat.td | 12 ---------- .../WebAssembly/WebAssemblyInstrFormats.td | 1 + .../WebAssembly/WebAssemblyInstrInfo.td | 7 ++---- .../WebAssembly/WebAssemblyInstrInteger.td | 9 -------- .../WebAssembly/WebAssemblyInstrMemory.td | 22 ------------------- .../WebAssembly/WebAssemblyInstrSIMD.td | 2 -- 11 files changed, 3 insertions(+), 90 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index 9eff2cfde0a..f9d092e4b8a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -24,10 +24,8 @@ multiclass ATOMIC_I; } -let Defs = [ARGUMENTS] in { defm ATOMIC_LOAD_I32 : WebAssemblyLoad; defm ATOMIC_LOAD_I64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] // Select loads with no constant offset. let Predicates = [HasAtomics] in { @@ -62,13 +60,11 @@ def : LoadPatExternSymOffOnly; // Extending loads. Note that there are only zero-extending atomic loads, no // sign-extending loads. -let Defs = [ARGUMENTS] in { defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad; defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad; defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad; defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad; defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] // Fragments for extending loads. 
These are different from regular loads because // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and @@ -200,10 +196,8 @@ def : LoadPatExternSymOffOnly; // Atomic stores //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { defm ATOMIC_STORE_I32 : WebAssemblyStore; defm ATOMIC_STORE_I64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] // We need an 'atomic' version of store patterns because store and atomic_store // nodes have different operand orders: @@ -263,13 +257,11 @@ def : AStorePatExternSymOffOnly; } // Predicates = [HasAtomics] // Truncating stores. -let Defs = [ARGUMENTS] in { defm ATOMIC_STORE8_I32 : WebAssemblyStore; defm ATOMIC_STORE16_I32 : WebAssemblyStore; defm ATOMIC_STORE8_I64 : WebAssemblyStore; defm ATOMIC_STORE16_I64 : WebAssemblyStore; defm ATOMIC_STORE32_I64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] // Fragments for truncating stores. @@ -341,8 +333,6 @@ def : AStorePatExternSymOffOnly; // Atomic binary read-modify-writes //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - multiclass WebAssemblyBinRMW { defm "" : I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val), @@ -430,7 +420,6 @@ defm ATOMIC_RMW16_U_XCHG_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW32_U_XCHG_I64 : WebAssemblyBinRMW; -} // Select binary RMWs with no constant offset. class BinRMWPatNoOffset : @@ -674,8 +663,6 @@ defm : BinRMWTruncExtPattern< // Consider adding a pass after instruction selection that optimizes this case // if it is frequent. -let Defs = [ARGUMENTS] in { - multiclass WebAssemblyTerRMW { defm "" : I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp, @@ -699,7 +686,6 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 : WebAssemblyTerRMW; defm ATOMIC_RMW32_U_CMPXCHG_I64 : WebAssemblyTerRMW; -} // Select ternary RMWs with no constant offset. 
class TerRMWPatNoOffset : @@ -912,7 +898,6 @@ defm : TerRMWTruncExtPattern< // Atomic wait / notify //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { let hasSideEffects = 1 in { defm ATOMIC_NOTIFY : I<(outs I32:$dst), @@ -935,7 +920,6 @@ defm ATOMIC_WAIT_I64 : "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>; } // mayLoad = 1 } // hasSideEffects = 1 -} // Defs = [ARGUMENTS] let Predicates = [HasAtomics] in { // Select notifys with no constant offset. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 3c9caa3f0de..07839b79011 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -15,8 +15,6 @@ // TODO: addr64: These currently assume the callee address is 32-bit. // FIXME: add $type to first call_indirect asmstr (and maybe $flags) -let Defs = [ARGUMENTS] in { - // Call sequence markers. These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { @@ -118,8 +116,6 @@ let Uses = [SP32, SP64], isCall = 1 in { 0x11>; } // Uses = [SP32,SP64], isCall = 1 -} // Defs = [ARGUMENTS] - // Patterns for matching a direct call to a global address. 
def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), (CALL_I32 tglobaladdr:$callee)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index e27d81937dd..ed9879ae454 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -12,8 +12,6 @@ /// //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { // The condition operand is a boolean value which WebAssembly represents as i32. defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond), @@ -30,15 +28,11 @@ defm BR : NRI<(outs), (ins bb_op:$dst), } // isBarrier = 1 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 -} // Defs = [ARGUMENTS] - def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), (BR_IF bb_op:$dst, I32:$cond)>; def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), (BR_UNLESS bb_op:$dst, I32:$cond)>; -let Defs = [ARGUMENTS] in { - // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode // currently. 
@@ -194,5 +188,3 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, [(catchret bb:$dst, bb:$from)], "", 0>; } } - -} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index c89c1b54981..e9ba52799ee 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -13,8 +13,6 @@ /// //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (trunc I64:$src))], "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>; @@ -51,15 +49,11 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins), 0xc4>; } // Predicates = [HasSignExt] -} // defs = [ARGUMENTS] - // Expand a "don't care" extend into zero-extend (chosen over sign-extend // somewhat arbitrarily, although it favors popular hardware architectures // and is conceptually a simpler operation). def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; -let Defs = [ARGUMENTS] in { - // Conversion from floating point to integer instructions which don't trap on // overflow or invalid. 
defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins), @@ -218,5 +212,3 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins), [(set F64:$dst, (bitconvert I64:$src))], "f64.reinterpret/i64\t$dst, $src", "f64.reinterpret/i64", 0xbf>; - -} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td index 41b39f69e51..a251d60b89e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td @@ -12,8 +12,6 @@ /// //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond), (outs), (ins), @@ -23,8 +21,6 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), "except_ref.select\t$dst, $lhs, $rhs, $cond", "except_ref.select", 0x1b>; -} // Defs = [ARGUMENTS] - def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index 70e27df27e6..364c485f409 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -45,8 +45,6 @@ multiclass ComparisonFP f32Inst, bits<32> f !strconcat("f64.", name), f64Inst>; } -let Defs = [ARGUMENTS] in { - let isCommutable = 1 in defm ADD : BinaryFP; defm SUB : BinaryFP; @@ -69,8 +67,6 @@ defm FLOOR : UnaryFP; defm TRUNC : UnaryFP; defm NEAREST : UnaryFP; -} // Defs = [ARGUMENTS] - // DAGCombine oddly folds casts into the rhs of copysign. Unfold them. 
def : Pat<(fcopysign F64:$lhs, F32:$rhs), (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; @@ -81,8 +77,6 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs), def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; -let Defs = [ARGUMENTS] in { - let isCommutable = 1 in { defm EQ : ComparisonFP; defm NE : ComparisonFP; @@ -92,8 +86,6 @@ defm LE : ComparisonFP; defm GT : ComparisonFP; defm GE : ComparisonFP; -} // Defs = [ARGUMENTS] - // Don't care floating-point comparisons, supported via other comparisons. def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>; def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>; @@ -108,8 +100,6 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>; def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>; def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>; -let Defs = [ARGUMENTS] in { - defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond), (outs), (ins), [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))], @@ -119,8 +109,6 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond), [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))], "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>; -} // Defs = [ARGUMENTS] - // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 683fb3d981f..2d23acfc825 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -30,6 +30,7 @@ class NI pattern, bit stack, string asmstr = "", dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; + let Defs = [ARGUMENTS]; } // Generates both register and stack based versions of one actual instruction. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index a2ea14cc28b..9e1409cf90e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -164,7 +164,8 @@ include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// multiclass ARGUMENT { - let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in + let hasSideEffects = 1, isCodeGenOnly = 1, + Defs = [], Uses = [ARGUMENTS] in defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno), [(set vt:$res, (WebAssemblyargument timm:$argno))]>; @@ -175,8 +176,6 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; -let Defs = [ARGUMENTS] in { - // get_local and set_local are not generated by instruction selection; they // are implied by virtual register uses and defs. 
multiclass LOCAL { @@ -266,8 +265,6 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>; } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 -} // Defs = [ARGUMENTS] - def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), (CONST_I32 tglobaladdr:$addr)>; def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 44c93de54aa..bd41f46214a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -45,9 +45,6 @@ multiclass ComparisonInt i32Inst, bits<32> !strconcat("i64.", name), i64Inst>; } - -let Defs = [ARGUMENTS] in { - // The spaces after the names are for aesthetic purposes only, to make // operands line up vertically after tab expansion. let isCommutable = 1 in @@ -97,16 +94,12 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (setcc I64:$src, 0, SETEQ))], "i64.eqz \t$dst, $src", "i64.eqz", 0x50>; -} // Defs = [ARGUMENTS] - // Optimize away an explicit mask on a rotate count. 
def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>; def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>; -let Defs = [ARGUMENTS] in { - defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond), (outs), (ins), [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], @@ -116,8 +109,6 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond), [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>; -} // Defs = [ARGUMENTS] - // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 76ef1461d22..ccc331d1bf0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -53,8 +53,6 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off), // We don't need a regPlusES because external symbols never have constant // offsets folded into them, so we can just use add. -let Defs = [ARGUMENTS] in { - // Defines atomic and non-atomic loads, regular and extending. multiclass WebAssemblyLoad { let mayLoad = 1 in @@ -73,8 +71,6 @@ defm LOAD_I64 : WebAssemblyLoad; defm LOAD_F32 : WebAssemblyLoad; defm LOAD_F64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] - // Select loads with no constant offset. class LoadPatNoOffset : Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>; @@ -144,8 +140,6 @@ def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; -let Defs = [ARGUMENTS] in { - // Extending load. 
defm LOAD8_S_I32 : WebAssemblyLoad; defm LOAD8_U_I32 : WebAssemblyLoad; @@ -158,8 +152,6 @@ defm LOAD16_U_I64 : WebAssemblyLoad; defm LOAD32_S_I64 : WebAssemblyLoad; defm LOAD32_U_I64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] - // Select extending loads with no constant offset. def : LoadPatNoOffset; def : LoadPatNoOffset; @@ -303,9 +295,6 @@ def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; - -let Defs = [ARGUMENTS] in { - // Defines atomic and non-atomic stores, regular and truncating multiclass WebAssemblyStore { let mayStore = 1 in @@ -323,8 +312,6 @@ defm STORE_I64 : WebAssemblyStore; defm STORE_F32 : WebAssemblyStore; defm STORE_F64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] - // Select stores with no constant offset. class StorePatNoOffset : Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>; @@ -389,9 +376,6 @@ def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; - -let Defs = [ARGUMENTS] in { - // Truncating store. defm STORE8_I32 : WebAssemblyStore; defm STORE16_I32 : WebAssemblyStore; @@ -399,8 +383,6 @@ defm STORE8_I64 : WebAssemblyStore; defm STORE16_I64 : WebAssemblyStore; defm STORE32_I64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] - // Select truncating stores with no constant offset. def : StorePatNoOffset; def : StorePatNoOffset; @@ -448,8 +430,6 @@ def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; -let Defs = [ARGUMENTS] in { - // Current memory size. 
defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags), (outs), (ins i32imm:$flags), @@ -493,8 +473,6 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta), 0x40>, Requires<[HasAddr32]>; -} // Defs = [ARGUMENTS] - def : Pat<(int_wasm_current_memory), (CURRENT_MEMORY_I32 0)>; def : Pat<(int_wasm_grow_memory I32:$delta), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 491ee56b794..90bdc17890b 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -55,7 +55,6 @@ multiclass ConstVec { "v128.const\t"#args, 0>; } -let Defs = [ARGUMENTS] in { defm "" : ConstVec; -} // Defs = [ARGUMENTS] // Create vector with identical lanes: splat def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; -- GitLab From 66a2c5ecaaec52cf521193dbc139fdeba8987720 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Wed, 10 Oct 2018 20:47:46 +0000 Subject: [PATCH 0022/1116] [InstCombine] reverse 'trunc X to <N x i1>' canonicalization; 2nd try Re-trying r344082 because it unintentionally included extra diffs. Original commit message: icmp ne (and X, 1), 0 --> trunc X to N x i1 Ideally, we'd do the same for scalars, but there will likely be regressions unless we add more trunc folds as we're doing here for vectors. The motivating vector case is from PR37549: https://bugs.llvm.org/show_bug.cgi?id=37549 define <4 x float> @bitwise_select(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { %c = fcmp ole <4 x float> %x, %y %s = sext <4 x i1> %c to <4 x i32> %s1 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1> %s2 = shufflevector <4 x i32> %s, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3> %cond = or <4 x i32> %s1, %s2 %condtr = trunc <4 x i32> %cond to <4 x i1> %r = select <4 x i1> %condtr, <4 x float> %z, <4 x float> %w ret <4 x float> %r } Here's a sampling of the vector codegen for that case using mask+icmp (current behavior) vs. 
trunc (with this patch): AVX before: vcmpleps %xmm1, %xmm0, %xmm0 vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1] vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3] vorps %xmm0, %xmm1, %xmm0 vandps LCPI0_0(%rip), %xmm0, %xmm0 vxorps %xmm1, %xmm1, %xmm1 vpcmpeqd %xmm1, %xmm0, %xmm0 vblendvps %xmm0, %xmm3, %xmm2, %xmm0 AVX after: vcmpleps %xmm1, %xmm0, %xmm0 vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1] vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3] vorps %xmm0, %xmm1, %xmm0 vblendvps %xmm0, %xmm2, %xmm3, %xmm0 AVX512f before: vcmpleps %xmm1, %xmm0, %xmm0 vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1] vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3] vorps %xmm0, %xmm1, %xmm0 vpbroadcastd LCPI0_0(%rip), %xmm1 ## xmm1 = [1,1,1,1] vptestnmd %zmm1, %zmm0, %k1 vblendmps %zmm3, %zmm2, %zmm0 {%k1} AVX512f after: vcmpleps %xmm1, %xmm0, %xmm0 vpermilps $80, %xmm0, %xmm1 ## xmm1 = xmm0[0,0,1,1] vpermilps $250, %xmm0, %xmm0 ## xmm0 = xmm0[2,2,3,3] vorps %xmm0, %xmm1, %xmm0 vpslld $31, %xmm0, %xmm0 vptestmd %zmm0, %zmm0, %k1 vblendmps %zmm2, %zmm3, %zmm0 {%k1} AArch64 before: fcmge v0.4s, v1.4s, v0.4s zip1 v1.4s, v0.4s, v0.4s zip2 v0.4s, v0.4s, v0.4s orr v0.16b, v1.16b, v0.16b movi v1.4s, #1 and v0.16b, v0.16b, v1.16b cmeq v0.4s, v0.4s, #0 bsl v0.16b, v3.16b, v2.16b AArch64 after: fcmge v0.4s, v1.4s, v0.4s zip1 v1.4s, v0.4s, v0.4s zip2 v0.4s, v0.4s, v0.4s orr v0.16b, v1.16b, v0.16b bsl v0.16b, v2.16b, v3.16b PowerPC-le before: xvcmpgesp 34, 35, 34 vspltisw 0, 1 vmrglw 3, 2, 2 vmrghw 2, 2, 2 xxlor 0, 35, 34 xxlxor 35, 35, 35 xxland 34, 0, 32 vcmpequw 2, 2, 3 xxsel 34, 36, 37, 34 PowerPC-le after: xvcmpgesp 34, 35, 34 vmrglw 3, 2, 2 vmrghw 2, 2, 2 xxlor 0, 35, 34 xxsel 34, 37, 36, 0 Differential Revision: https://reviews.llvm.org/D52747 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344181 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineCasts.cpp | 31 +- .../InstCombine/InstCombineCompares.cpp | 7 + 
test/Transforms/InstCombine/apint-shift.ll | 4 +- .../Transforms/InstCombine/apint-shl-trunc.ll | 5 +- test/Transforms/InstCombine/icmp.ll | 20 +- test/Transforms/InstCombine/vector-casts.ll | 19 +- .../LoopVectorize/X86/masked_load_store.ll | 336 +++++++++--------- 7 files changed, 213 insertions(+), 209 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index fd59c3a7c0c..74f1e695ff6 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -706,12 +706,35 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (SimplifyDemandedInstructionBits(CI)) return &CI; - // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector. if (DestTy->getScalarSizeInBits() == 1) { - Constant *One = ConstantInt::get(SrcTy, 1); - Src = Builder.CreateAnd(Src, One); Value *Zero = Constant::getNullValue(Src->getType()); - return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); + if (DestTy->isIntegerTy()) { + // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only). + // TODO: We canonicalize to more instructions here because we are probably + // lacking equivalent analysis for trunc relative to icmp. There may also + // be codegen concerns. If those trunc limitations were removed, we could + // remove this transform. + Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1)); + return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); + } + + // For vectors, we do not canonicalize all truncs to icmp, so optimize + // patterns that would be covered within visitICmpInst. 
+ Value *X; + const APInt *C; + if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) { + // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0 + APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C); + Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC)); + return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); + } + if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)), + m_Deferred(X))))) { + // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0 + APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1; + Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC)); + return new ICmpInst(ICmpInst::ICMP_NE, And, Zero); + } } // FIXME: Maybe combine the next two transforms to handle the no cast case diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 07bd98b30ab..bf8bc8818f7 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1609,6 +1609,13 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And, Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And, const APInt &C1) { + // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1 + // TODO: We canonicalize to the longer form for scalars because we have + // better analysis/folds for icmp, and codegen may be better with icmp. 
+ if (Cmp.getPredicate() == CmpInst::ICMP_NE && Cmp.getType()->isVectorTy() && + C1.isNullValue() && match(And->getOperand(1), m_One())) + return new TruncInst(And->getOperand(0), Cmp.getType()); + const APInt *C2; if (!match(And->getOperand(1), m_APInt(C2))) return nullptr; diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll index fc564665a60..3266fa6e443 100644 --- a/test/Transforms/InstCombine/apint-shift.ll +++ b/test/Transforms/InstCombine/apint-shift.ll @@ -319,8 +319,8 @@ define i1 @test16(i84 %X) { define <2 x i1> @test16vec(<2 x i84> %X) { ; CHECK-LABEL: @test16vec( -; CHECK-NEXT: [[AND:%.*]] = and <2 x i84> %X, -; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[AND]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i84> [[X:%.*]], +; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i84> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %shr = ashr <2 x i84> %X, diff --git a/test/Transforms/InstCombine/apint-shl-trunc.ll b/test/Transforms/InstCombine/apint-shl-trunc.ll index c7d7d369592..2241c88cb6b 100644 --- a/test/Transforms/InstCombine/apint-shl-trunc.ll +++ b/test/Transforms/InstCombine/apint-shl-trunc.ll @@ -27,9 +27,8 @@ define i1 @test1(i799 %X, i799 %A) { define <2 x i1> @test0vec(<2 x i39> %X, <2 x i39> %A) { ; CHECK-LABEL: @test0vec( -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i39> , [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i39> [[TMP1]], [[X:%.*]] -; CHECK-NEXT: [[D:%.*]] = icmp ne <2 x i39> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[B:%.*]] = lshr <2 x i39> [[X:%.*]], [[A:%.*]] +; CHECK-NEXT: [[D:%.*]] = trunc <2 x i39> [[B]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[D]] ; %B = lshr <2 x i39> %X, %A diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll index 1f97009911b..1f05bb67e96 100644 --- a/test/Transforms/InstCombine/icmp.ll +++ b/test/Transforms/InstCombine/icmp.ll @@ -2427,10 +2427,9 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) { define <2 x 
i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_and_or_lshr_vec( -; CHECK-NEXT: [[SHF1:%.*]] = shl nuw <2 x i32> , [[Y:%.*]] -; CHECK-NEXT: [[OR2:%.*]] = or <2 x i32> [[SHF1]], -; CHECK-NEXT: [[AND3:%.*]] = and <2 x i32> [[OR2]], [[X:%.*]] -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND3]], zeroinitializer +; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[SHF]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %shf = lshr <2 x i32> %x, %y @@ -2445,8 +2444,7 @@ define <2 x i1> @icmp_and_or_lshr_vec_commute(<2 x i32> %xp, <2 x i32> %y) { ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], ; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], [[Y:%.*]] ; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer +; CHECK-NEXT: [[RET:%.*]] = trunc <2 x i32> [[OR]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization @@ -2472,8 +2470,8 @@ define i1 @icmp_and_or_lshr_cst(i32 %x) { define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) { ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec( -; CHECK-NEXT: [[AND1:%.*]] = and <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND1]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %shf = lshr <2 x i32> %x, @@ -2486,10 +2484,8 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec(<2 x i32> %x) { define <2 x i1> @icmp_and_or_lshr_cst_vec_commute(<2 x i32> %xp) { ; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_commute( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], -; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X]], -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[X]], [[SHF]] -; 
CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[OR]], -; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], +; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll index 6e0d66b8ed4..e0d6083a969 100644 --- a/test/Transforms/InstCombine/vector-casts.ll +++ b/test/Transforms/InstCombine/vector-casts.ll @@ -1,26 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -; This turns into a&1 != 0 -; TODO: The bar for canonicalizing to something bigger than the minimal length IR is very high. -; This pattern does not appear to meet that standard. +; Can't get smaller than this. define <2 x i1> @trunc(<2 x i64> %a) { ; CHECK-LABEL: @trunc( -; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[T:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[T:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[T]] ; %t = trunc <2 x i64> %a to <2 x i1> ret <2 x i1> %t } -; TODO: This could be just 1 instruction (trunc). +; This is trunc. define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer +; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -28,12 +24,11 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc). +; This is trunc. 
define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], zeroinitializer +; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; %t = and <2 x i64> %a, @@ -41,7 +36,7 @@ define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc). +; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { ; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( diff --git a/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 8e948639ba1..71038feec7b 100644 --- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -2901,49 +2901,45 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], -; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], -; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer -; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x 
i8> [[WIDE_LOAD]] to <4 x i1> +; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> +; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> +; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4 +; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 ; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12 ; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 -; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* -; 
AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 12 -; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) -; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] -; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] -; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] -; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef) +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> 
[[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] +; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] +; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] +; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 +; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 -; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 -; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: call 
void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -2953,14 +2949,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null +; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ 
-2998,49 +2994,45 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], -; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], -; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> +; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> +; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> +; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr 
inbounds double*, double** [[TMP12]], i64 16 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 16 -; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double*, double** [[TMP16]], i64 24 -; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) -; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = 
getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] -; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] -; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] -; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] +; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] +; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] +; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 
x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 +; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3050,14 +3042,14 @@ define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigg ; AVX512: for.body: 
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null +; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3162,49 +3154,45 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 ; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* ; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 -; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], -; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], -; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], -; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], -; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer -; AVX-NEXT: [[TMP14:%.*]] = icmp 
ne <4 x i8> [[TMP10]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> +; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> +; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> +; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4 +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 ; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12 ; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> 
@llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 -; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 12 -; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>* -; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef) -; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] -; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] -; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] -; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] -; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) -; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 
x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] +; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] +; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] +; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 +; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 ; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) -; AVX-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 -; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) -; AVX-NEXT: 
[[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 12 -; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* -; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 -; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3214,14 +3202,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX: land.lhs.true: ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null +; AVX-NEXT: [[TMP39:%.*]] = load i32 
()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null ; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX: if.then: ; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] @@ -3259,49 +3247,45 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* ; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 -; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], -; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], -; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> +; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> +; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> +; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 +; 
AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16 ; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 16 -; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP16]], i64 24 -; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef) -; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], 
zeroinitializer -; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer -; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] -; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] -; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] -; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] -; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] -; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] +; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] +; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] +; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 
x double> , <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 ; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 16 -; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP28]], i64 24 -; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 +; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop !66 ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] @@ -3311,14 +3295,14 @@ define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigg ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 -; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 ; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] ; AVX512: land.lhs.true: ; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 -; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null +; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null ; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] -- GitLab From 4c5954f7cf9763d09091f37fd909f70e21f96571 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 10 Oct 2018 20:50:52 +0000 Subject: [PATCH 0023/1116] [NFC][X86][AArch64] extract-bits.ll: add tests with constants+storing results. As noted in https://reviews.llvm.org/D53080#inline-467678, this *may* get pessimized by that diff. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344182 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AArch64/extract-bits.ll | 90 ++++++++ test/CodeGen/X86/extract-bits.ll | 329 +++++++++++++++++++++++++++ 2 files changed, 419 insertions(+) diff --git a/test/CodeGen/AArch64/extract-bits.ll b/test/CodeGen/AArch64/extract-bits.ll index a60883b958e..21bebc67969 100644 --- a/test/CodeGen/AArch64/extract-bits.ll +++ b/test/CodeGen/AArch64/extract-bits.ll @@ -838,3 +838,93 @@ define i64 @c4_i64_bad(i64 %arg) { %tmp1 = and i64 %tmp0, 16382 ret i64 %tmp1 } + +; ---------------------------------------------------------------------------- ; +; Constant, storing the result afterwards. +; ---------------------------------------------------------------------------- ; + +; i32 + +; The most canonical variant +define void @c5_i32(i32 %arg, i32* %ptr) { +; CHECK-LABEL: c5_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx w8, w0, #19, #10 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + store i32 %tmp1, i32* %ptr + ret void +} + +; Should be still fine, but the mask is shifted +define void @c6_i32(i32 %arg, i32* %ptr) { +; CHECK-LABEL: c6_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx w8, w0, #19, #12 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 4095 + store i32 %tmp1, i32* %ptr + ret void +} + +; Should be still fine, but the result is shifted left afterwards +define void @c7_i32(i32 %arg, i32* %ptr) { +; CHECK-LABEL: c7_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx w8, w0, #19, #10 +; CHECK-NEXT: lsl w8, w8, #2 +; CHECK-NEXT: str w8, [x1] +; CHECK-NEXT: ret + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + %tmp2 = shl i32 %tmp1, 2 + store i32 %tmp2, i32* %ptr + ret void +} + +; i64 + +; The most canonical variant +define void @c5_i64(i64 %arg, i64* %ptr) { +; CHECK-LABEL: c5_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx x8, x0, #51, #10 +; CHECK-NEXT: 
str x8, [x1] +; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + store i64 %tmp1, i64* %ptr + ret void +} + +; Should be still fine, but the mask is shifted +define void @c6_i64(i64 %arg, i64* %ptr) { +; CHECK-LABEL: c6_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx x8, x0, #51, #12 +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 4095 + store i64 %tmp1, i64* %ptr + ret void +} + +; Should be still fine, but the result is shifted left afterwards +define void @c7_i64(i64 %arg, i64* %ptr) { +; CHECK-LABEL: c7_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ubfx x8, x0, #51, #10 +; CHECK-NEXT: lsl x8, x8, #2 +; CHECK-NEXT: str x8, [x1] +; CHECK-NEXT: ret + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + %tmp2 = shl i64 %tmp1, 2 + store i64 %tmp2, i64* %ptr + ret void +} diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll index 6ee5b4a39a5..98c9ab271cb 100644 --- a/test/CodeGen/X86/extract-bits.ll +++ b/test/CodeGen/X86/extract-bits.ll @@ -5880,3 +5880,332 @@ define i64 @c4_i64_bad(i64 %arg) { %tmp1 = and i64 %tmp0, 16382 ret i64 %tmp1 } + +; ---------------------------------------------------------------------------- ; +; Constant, storing the result afterwards. 
+; ---------------------------------------------------------------------------- ; + +; i32 + +; The most canonical variant +define void @c5_i32(i32 %arg, i32* %ptr) { +; X86-NOBMI-LABEL: c5_i32: +; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: shrl $19, %ecx +; X86-NOBMI-NEXT: andl $1023, %ecx # imm = 0x3FF +; X86-NOBMI-NEXT: movl %ecx, (%eax) +; X86-NOBMI-NEXT: retl +; +; X86-BMI1NOTBM-LABEL: c5_i32: +; X86-BMI1NOTBM: # %bb.0: +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movl $2579, %ecx # imm = 0xA13 +; X86-BMI1NOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBM-NEXT: retl +; +; X86-BMI1TBM-LABEL: c5_i32: +; X86-BMI1TBM: # %bb.0: +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1TBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13 +; X86-BMI1TBM-NEXT: movl %ecx, (%eax) +; X86-BMI1TBM-NEXT: retl +; +; X86-BMI1NOTBMBMI2-LABEL: c5_i32: +; X86-BMI1NOTBMBMI2: # %bb.0: +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBMBMI2-NEXT: movl $2579, %ecx # imm = 0xA13 +; X86-BMI1NOTBMBMI2-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBMBMI2-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBMBMI2-NEXT: retl +; +; X64-NOBMI-LABEL: c5_i32: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: shrl $19, %edi +; X64-NOBMI-NEXT: andl $1023, %edi # imm = 0x3FF +; X64-NOBMI-NEXT: movl %edi, (%rsi) +; X64-NOBMI-NEXT: retq +; +; X64-BMI1NOTBM-LABEL: c5_i32: +; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: movl $2579, %eax # imm = 0xA13 +; X64-BMI1NOTBM-NEXT: bextrl %eax, %edi, %eax +; X64-BMI1NOTBM-NEXT: movl %eax, (%rsi) +; X64-BMI1NOTBM-NEXT: retq +; +; X64-BMI1TBM-LABEL: c5_i32: +; X64-BMI1TBM: # %bb.0: +; X64-BMI1TBM-NEXT: bextrl $2579, %edi, %eax # imm = 0xA13 +; X64-BMI1TBM-NEXT: movl %eax, (%rsi) +; X64-BMI1TBM-NEXT: retq +; +; X64-BMI1NOTBMBMI2-LABEL: c5_i32: +; X64-BMI1NOTBMBMI2: # 
%bb.0: +; X64-BMI1NOTBMBMI2-NEXT: movl $2579, %eax # imm = 0xA13 +; X64-BMI1NOTBMBMI2-NEXT: bextrl %eax, %edi, %eax +; X64-BMI1NOTBMBMI2-NEXT: movl %eax, (%rsi) +; X64-BMI1NOTBMBMI2-NEXT: retq + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + store i32 %tmp1, i32* %ptr + ret void +} + +; Should be still fine, but the mask is shifted +define void @c6_i32(i32 %arg, i32* %ptr) { +; X86-NOBMI-LABEL: c6_i32: +; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: shrl $19, %ecx +; X86-NOBMI-NEXT: andl $4095, %ecx # imm = 0xFFF +; X86-NOBMI-NEXT: movl %ecx, (%eax) +; X86-NOBMI-NEXT: retl +; +; X86-BMI1NOTBM-LABEL: c6_i32: +; X86-BMI1NOTBM: # %bb.0: +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movl $3091, %ecx # imm = 0xC13 +; X86-BMI1NOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBM-NEXT: retl +; +; X86-BMI1TBM-LABEL: c6_i32: +; X86-BMI1TBM: # %bb.0: +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1TBM-NEXT: bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13 +; X86-BMI1TBM-NEXT: movl %ecx, (%eax) +; X86-BMI1TBM-NEXT: retl +; +; X86-BMI1NOTBMBMI2-LABEL: c6_i32: +; X86-BMI1NOTBMBMI2: # %bb.0: +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBMBMI2-NEXT: movl $3091, %ecx # imm = 0xC13 +; X86-BMI1NOTBMBMI2-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBMBMI2-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBMBMI2-NEXT: retl +; +; X64-NOBMI-LABEL: c6_i32: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: shrl $19, %edi +; X64-NOBMI-NEXT: andl $4095, %edi # imm = 0xFFF +; X64-NOBMI-NEXT: movl %edi, (%rsi) +; X64-NOBMI-NEXT: retq +; +; X64-BMI1NOTBM-LABEL: c6_i32: +; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: movl $3091, %eax # imm = 0xC13 +; X64-BMI1NOTBM-NEXT: bextrl %eax, %edi, %eax +; X64-BMI1NOTBM-NEXT: movl %eax, (%rsi) +; X64-BMI1NOTBM-NEXT: retq +; +; X64-BMI1TBM-LABEL: 
c6_i32: +; X64-BMI1TBM: # %bb.0: +; X64-BMI1TBM-NEXT: bextrl $3091, %edi, %eax # imm = 0xC13 +; X64-BMI1TBM-NEXT: movl %eax, (%rsi) +; X64-BMI1TBM-NEXT: retq +; +; X64-BMI1NOTBMBMI2-LABEL: c6_i32: +; X64-BMI1NOTBMBMI2: # %bb.0: +; X64-BMI1NOTBMBMI2-NEXT: movl $3091, %eax # imm = 0xC13 +; X64-BMI1NOTBMBMI2-NEXT: bextrl %eax, %edi, %eax +; X64-BMI1NOTBMBMI2-NEXT: movl %eax, (%rsi) +; X64-BMI1NOTBMBMI2-NEXT: retq + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 4095 + store i32 %tmp1, i32* %ptr + ret void +} + +; Should be still fine, but the result is shifted left afterwards +define void @c7_i32(i32 %arg, i32* %ptr) { +; X86-LABEL: c7_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $17, %ecx +; X86-NEXT: andl $4092, %ecx # imm = 0xFFC +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: c7_i32: +; X64: # %bb.0: +; X64-NEXT: shrl $17, %edi +; X64-NEXT: andl $4092, %edi # imm = 0xFFC +; X64-NEXT: movl %edi, (%rsi) +; X64-NEXT: retq + %tmp0 = lshr i32 %arg, 19 + %tmp1 = and i32 %tmp0, 1023 + %tmp2 = shl i32 %tmp1, 2 + store i32 %tmp2, i32* %ptr + ret void +} + +; i64 + +; The most canonical variant +define void @c5_i64(i64 %arg, i64* %ptr) { +; X86-NOBMI-LABEL: c5_i64: +; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: shrl $19, %ecx +; X86-NOBMI-NEXT: andl $1023, %ecx # imm = 0x3FF +; X86-NOBMI-NEXT: movl %ecx, (%eax) +; X86-NOBMI-NEXT: movl $0, 4(%eax) +; X86-NOBMI-NEXT: retl +; +; X86-BMI1NOTBM-LABEL: c5_i64: +; X86-BMI1NOTBM: # %bb.0: +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movl $2579, %ecx # imm = 0xA13 +; X86-BMI1NOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBM-NEXT: movl $0, 4(%eax) +; X86-BMI1NOTBM-NEXT: retl +; +; X86-BMI1TBM-LABEL: c5_i64: +; X86-BMI1TBM: # %bb.0: +; X86-BMI1TBM-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-BMI1TBM-NEXT: bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13 +; X86-BMI1TBM-NEXT: movl %ecx, (%eax) +; X86-BMI1TBM-NEXT: movl $0, 4(%eax) +; X86-BMI1TBM-NEXT: retl +; +; X86-BMI1NOTBMBMI2-LABEL: c5_i64: +; X86-BMI1NOTBMBMI2: # %bb.0: +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBMBMI2-NEXT: movl $2579, %ecx # imm = 0xA13 +; X86-BMI1NOTBMBMI2-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBMBMI2-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBMBMI2-NEXT: movl $0, 4(%eax) +; X86-BMI1NOTBMBMI2-NEXT: retl +; +; X64-NOBMI-LABEL: c5_i64: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: shrq $51, %rdi +; X64-NOBMI-NEXT: andl $1023, %edi # imm = 0x3FF +; X64-NOBMI-NEXT: movq %rdi, (%rsi) +; X64-NOBMI-NEXT: retq +; +; X64-BMI1NOTBM-LABEL: c5_i64: +; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: movl $2611, %eax # imm = 0xA33 +; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax +; X64-BMI1NOTBM-NEXT: movq %rax, (%rsi) +; X64-BMI1NOTBM-NEXT: retq +; +; X64-BMI1TBM-LABEL: c5_i64: +; X64-BMI1TBM: # %bb.0: +; X64-BMI1TBM-NEXT: bextrq $2611, %rdi, %rax # imm = 0xA33 +; X64-BMI1TBM-NEXT: movq %rax, (%rsi) +; X64-BMI1TBM-NEXT: retq +; +; X64-BMI1NOTBMBMI2-LABEL: c5_i64: +; X64-BMI1NOTBMBMI2: # %bb.0: +; X64-BMI1NOTBMBMI2-NEXT: movl $2611, %eax # imm = 0xA33 +; X64-BMI1NOTBMBMI2-NEXT: bextrq %rax, %rdi, %rax +; X64-BMI1NOTBMBMI2-NEXT: movq %rax, (%rsi) +; X64-BMI1NOTBMBMI2-NEXT: retq + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + store i64 %tmp1, i64* %ptr + ret void +} + +; Should be still fine, but the mask is shifted +define void @c6_i64(i64 %arg, i64* %ptr) { +; X86-NOBMI-LABEL: c6_i64: +; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: shrl $19, %ecx +; X86-NOBMI-NEXT: andl $4095, %ecx # imm = 0xFFF +; X86-NOBMI-NEXT: movl %ecx, (%eax) +; X86-NOBMI-NEXT: movl $0, 4(%eax) +; X86-NOBMI-NEXT: retl +; +; X86-BMI1NOTBM-LABEL: c6_i64: +; 
X86-BMI1NOTBM: # %bb.0: +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movl $3091, %ecx # imm = 0xC13 +; X86-BMI1NOTBM-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBM-NEXT: movl $0, 4(%eax) +; X86-BMI1NOTBM-NEXT: retl +; +; X86-BMI1TBM-LABEL: c6_i64: +; X86-BMI1TBM: # %bb.0: +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1TBM-NEXT: bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13 +; X86-BMI1TBM-NEXT: movl %ecx, (%eax) +; X86-BMI1TBM-NEXT: movl $0, 4(%eax) +; X86-BMI1TBM-NEXT: retl +; +; X86-BMI1NOTBMBMI2-LABEL: c6_i64: +; X86-BMI1NOTBMBMI2: # %bb.0: +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBMBMI2-NEXT: movl $3091, %ecx # imm = 0xC13 +; X86-BMI1NOTBMBMI2-NEXT: bextrl %ecx, {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBMBMI2-NEXT: movl %ecx, (%eax) +; X86-BMI1NOTBMBMI2-NEXT: movl $0, 4(%eax) +; X86-BMI1NOTBMBMI2-NEXT: retl +; +; X64-NOBMI-LABEL: c6_i64: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: shrq $51, %rdi +; X64-NOBMI-NEXT: andl $4095, %edi # imm = 0xFFF +; X64-NOBMI-NEXT: movq %rdi, (%rsi) +; X64-NOBMI-NEXT: retq +; +; X64-BMI1NOTBM-LABEL: c6_i64: +; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: movl $3123, %eax # imm = 0xC33 +; X64-BMI1NOTBM-NEXT: bextrq %rax, %rdi, %rax +; X64-BMI1NOTBM-NEXT: movq %rax, (%rsi) +; X64-BMI1NOTBM-NEXT: retq +; +; X64-BMI1TBM-LABEL: c6_i64: +; X64-BMI1TBM: # %bb.0: +; X64-BMI1TBM-NEXT: bextrq $3123, %rdi, %rax # imm = 0xC33 +; X64-BMI1TBM-NEXT: movq %rax, (%rsi) +; X64-BMI1TBM-NEXT: retq +; +; X64-BMI1NOTBMBMI2-LABEL: c6_i64: +; X64-BMI1NOTBMBMI2: # %bb.0: +; X64-BMI1NOTBMBMI2-NEXT: movl $3123, %eax # imm = 0xC33 +; X64-BMI1NOTBMBMI2-NEXT: bextrq %rax, %rdi, %rax +; X64-BMI1NOTBMBMI2-NEXT: movq %rax, (%rsi) +; X64-BMI1NOTBMBMI2-NEXT: retq + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 4095 + store i64 %tmp1, i64* %ptr + ret void +} + +; Should be still fine, but the result is shifted left afterwards +define void 
@c7_i64(i64 %arg, i64* %ptr) { +; X86-LABEL: c7_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shrl $17, %ecx +; X86-NEXT: andl $4092, %ecx # imm = 0xFFC +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: retl +; +; X64-LABEL: c7_i64: +; X64: # %bb.0: +; X64-NEXT: shrq $49, %rdi +; X64-NEXT: andl $4092, %edi # imm = 0xFFC +; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: retq + %tmp0 = lshr i64 %arg, 51 + %tmp1 = and i64 %tmp0, 1023 + %tmp2 = shl i64 %tmp1, 2 + store i64 %tmp2, i64* %ptr + ret void +} -- GitLab From 742beb6d7db253a3fbbdcd398f0fa1dfd9fc82e6 Mon Sep 17 00:00:00 2001 From: James Y Knight Date: Wed, 10 Oct 2018 21:07:02 +0000 Subject: [PATCH 0024/1116] llvm-ar: Darwin archive format fixes. * Support writing the DARWIN64 symbol table format. * In darwin archives, emit a symbol table whenever requested, even when there are no members, as the apple linker will abort if given an archive without a symbol table. Added tests for same, and also simplified and moved the GNU 64-bit symbol table test into archive-symtab.test. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344183 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Object/ArchiveWriter.cpp | 61 ++++++++++++++++------------ test/Object/archive-GNU64-write.test | 40 ------------------ test/Object/archive-format.test | 2 +- test/Object/archive-symtab.test | 28 +++++++++++++ 4 files changed, 64 insertions(+), 67 deletions(-) delete mode 100644 test/Object/archive-GNU64-write.test diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp index c6c0befb90f..767205390e0 100644 --- a/lib/Object/ArchiveWriter.cpp +++ b/lib/Object/ArchiveWriter.cpp @@ -121,6 +121,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) { OS.indent(Size - SizeSoFar); } +static bool isDarwin(object::Archive::Kind Kind) { + return Kind == object::Archive::K_DARWIN || + Kind == object::Archive::K_DARWIN64; +} + static bool isBSDLike(object::Archive::Kind Kind) { switch (Kind) { case object::Archive::K_GNU: @@ -128,8 +133,8 @@ static bool isBSDLike(object::Archive::Kind Kind) { return false; case object::Archive::K_BSD: case object::Archive::K_DARWIN: - return true; case object::Archive::K_DARWIN64: + return true; case object::Archive::K_COFF: break; } @@ -314,7 +319,9 @@ static void printNBits(raw_ostream &Out, object::Archive::Kind Kind, static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, bool Deterministic, ArrayRef Members, StringRef StringTable) { - if (StringTable.empty()) + // We don't write a symbol table on an archive with no members -- except on + // Darwin, where the linker will abort unless the archive has a symbol table. + if (StringTable.empty() && !isDarwin(Kind)) return; unsigned NumSyms = 0; @@ -322,15 +329,15 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, NumSyms += M.Symbols.size(); unsigned Size = 0; - Size += is64BitKind(Kind) ? 8 : 4; // Number of entries + unsigned OffsetSize = is64BitKind(Kind) ? 
sizeof(uint64_t) : sizeof(uint32_t); + + Size += OffsetSize; // Number of entries if (isBSDLike(Kind)) - Size += NumSyms * 8; // Table - else if (is64BitKind(Kind)) - Size += NumSyms * 8; // Table + Size += NumSyms * OffsetSize * 2; // Table else - Size += NumSyms * 4; // Table + Size += NumSyms * OffsetSize; // Table if (isBSDLike(Kind)) - Size += 4; // byte count + Size += OffsetSize; // byte count Size += StringTable.size(); // ld64 expects the members to be 8-byte aligned for 64-bit content and at // least 4-byte aligned for 32-bit content. Opt for the larger encoding @@ -340,25 +347,26 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, unsigned Pad = OffsetToAlignment(Size, Alignment); Size += Pad; - if (isBSDLike(Kind)) - printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0, - 0, Size); - else if (is64BitKind(Kind)) - printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size); - else - printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size); + if (isBSDLike(Kind)) { + const char *Name = is64BitKind(Kind) ? "__.SYMDEF_64" : "__.SYMDEF"; + printBSDMemberHeader(Out, Out.tell(), Name, now(Deterministic), 0, 0, 0, + Size); + } else { + const char *Name = is64BitKind(Kind) ? 
"/SYM64" : ""; + printGNUSmallMemberHeader(Out, Name, now(Deterministic), 0, 0, 0, Size); + } uint64_t Pos = Out.tell() + Size; if (isBSDLike(Kind)) - print(Out, Kind, NumSyms * 8); + printNBits(Out, Kind, NumSyms * 2 * OffsetSize); else printNBits(Out, Kind, NumSyms); for (const MemberData &M : Members) { for (unsigned StringOffset : M.Symbols) { if (isBSDLike(Kind)) - print(Out, Kind, StringOffset); + printNBits(Out, Kind, StringOffset); printNBits(Out, Kind, Pos); // member offset } Pos += M.Header.size() + M.Data.size() + M.Padding.size(); @@ -366,7 +374,7 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind, if (isBSDLike(Kind)) // byte count of the string table - print(Out, Kind, StringTable.size()); + printNBits(Out, Kind, StringTable.size()); Out << StringTable; while (Pad--) @@ -466,9 +474,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, // See also the functions that handle the lookup: // in lldb: ObjectContainerBSDArchive::Archive::FindObject() // in llvm/tools/dsymutil: BinaryHolder::GetArchiveMemberBuffers(). - bool UniqueTimestamps = - Deterministic && (Kind == object::Archive::K_DARWIN || - Kind == object::Archive::K_DARWIN64); + bool UniqueTimestamps = Deterministic && isDarwin(Kind); std::map FilenameCount; if (UniqueTimestamps) { for (const NewArchiveMember &M : NewMembers) @@ -488,9 +494,8 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames, // least 4-byte aligned for 32-bit content. Opt for the larger encoding // uniformly. This matches the behaviour with cctools and ensures that ld64 // is happy with archives that we generate. - unsigned MemberPadding = Kind == object::Archive::K_DARWIN - ? OffsetToAlignment(Data.size(), 8) - : 0; + unsigned MemberPadding = + isDarwin(Kind) ? 
OffsetToAlignment(Data.size(), 8) : 0; unsigned TailPadding = OffsetToAlignment(Data.size() + MemberPadding, 2); StringRef Padding = StringRef(PaddingData, MemberPadding + TailPadding); @@ -569,8 +574,12 @@ Error llvm::writeArchive(StringRef ArcName, // If LastOffset isn't going to fit in a 32-bit varible we need to switch // to 64-bit. Note that the file can be larger than 4GB as long as the last // member starts before the 4GB offset. - if (LastOffset >= (1ULL << Sym64Threshold)) - Kind = object::Archive::K_GNU64; + if (LastOffset >= (1ULL << Sym64Threshold)) { + if (Kind == object::Archive::K_DARWIN) + Kind = object::Archive::K_DARWIN64; + else + Kind = object::Archive::K_GNU64; + } } Expected Temp = diff --git a/test/Object/archive-GNU64-write.test b/test/Object/archive-GNU64-write.test deleted file mode 100644 index 0bfb7c80d05..00000000000 --- a/test/Object/archive-GNU64-write.test +++ /dev/null @@ -1,40 +0,0 @@ -# REQUIRES: llvm-64-bits -# REQUIRES: system-linux -# REQUIRES: shell - -# RUN: yaml2obj %s > %t -# RUN: dd if=%t of=%t bs=1 count=0 seek=1M -# RUN: rm -f %t.lib -# RUN: cp %t %t2 -# RUN: SYM64_THRESHOLD=19 llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64 -# RUN: llvm-nm --print-armap %t.lib | FileCheck %s -# RUN: grep SYM64 %t.lib - -# Delete temp files. They are too large. 
-# RUN: rm -f %t %t2 %t.lib - -!ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_EXEC - Machine: EM_X86_64 -Sections: - - Name: .data - Type: SHT_PROGBITS - Flags: [ SHF_ALLOC ] - AddressAlign: 0x0000000000000001 - Content: "00" - Size: 32 - -# CHECK: Archive map -# CHECK-NEXT: main in trivial-object-test.elf-x86-64 - -# CHECK: archive-GNU64-write.test.tmp: - -# CHECK: archive-GNU64-write.test.tmp2: - -# CHECK: trivial-object-test.elf-x86-64: -# CHECK-NEXT: U SomeOtherFunction -# CHECK-NEXT: 0000000000000000 T main -# CHECK-NEXT: U puts diff --git a/test/Object/archive-format.test b/test/Object/archive-format.test index 219fc7f894a..b1ae411161b 100644 --- a/test/Object/archive-format.test +++ b/test/Object/archive-format.test @@ -38,7 +38,7 @@ BSD-SAME: #1/16 0 0 0 644 20 ` BSD-NEXT: 0123456789abcdefzed. RUN: rm -f %t.a -RUN: llvm-ar --format=darwin rc %t.a 0123456789abcde 0123456789abcdef +RUN: llvm-ar --format=darwin rcS %t.a 0123456789abcde 0123456789abcdef RUN: cat %t.a | FileCheck -strict-whitespace --check-prefix=DARWIN %s DARWIN: ! 
diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test index 297970725bd..96f48139ddd 100644 --- a/test/Object/archive-symtab.test +++ b/test/Object/archive-symtab.test @@ -2,6 +2,11 @@ RUN: rm -f %t.a RUN: llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64 RUN: llvm-nm -M %t.a | FileCheck %s +RUN: rm -f %t.a +RUN: env SYM64_THRESHOLD=1 llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64 +RUN: llvm-nm -M %t.a | FileCheck %s +RUN: grep SYM64 %t.a + CHECK: Archive map CHECK-NEXT: main in trivial-object-test.elf-x86-64 CHECK-NEXT: foo in trivial-object-test2.elf-x86-64 @@ -82,6 +87,11 @@ RUN: rm -f %t.a RUN: llvm-ar --format=bsd rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64 RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s +RUN: rm -f %t.a +RUN: env SYM64_THRESHOLD=1 llvm-ar --format=darwin rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64 +RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s +RUN: grep '__\.SYMDEF_64' %t.a + MACHO: Archive map MACHO-NEXT: _main in trivial-object-test.macho-x86-64 MACHO-NEXT: _foo in trivial-object-test2.macho-x86-64 @@ -138,3 +148,21 @@ RUN: llvm-ar --format=gnu rcsD %t.a %p/Inputs/trivial-object-test.macho-x86-64 RUN: FileCheck --check-prefix=GNU-SYMTAB-ALIGN %s < %t.a GNU-SYMTAB-ALIGN: ! 
GNU-SYMTAB-ALIGN-NEXT: / 0 0 0 0 14 ` + + +** Test the behavior of an empty archive: + +No symbol table emitted for GNU archives +RUN: rm -f %t.a +RUN: llvm-ar rcs --format=gnu %t.a +RUN: not grep -q '/ ' %t.a + +No symbol table for BSD archives +RUN: rm -f %t.a +RUN: llvm-ar rcs --format=bsd %t.a +RUN: not grep -q '__\.SYMDEF' %t.a + +And we do emit a symbol table for DARWIN archives +RUN: rm -f %t.a +RUN: llvm-ar rcs --format=darwin %t.a +RUN: grep -q '__\.SYMDEF' %t.a -- GitLab From 3b1e430b900c604ea3c9e3dd1ceee07db3338a5e Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 10 Oct 2018 21:09:37 +0000 Subject: [PATCH 0025/1116] Support for remapping profile data when symbols change, for instrumentation-based profiling. Reviewers: davidxl, tejohnson, dlj, erik.pilkington Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D51247 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344184 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ProfileData/InstrProfReader.h | 30 +++- lib/ProfileData/InstrProfReader.cpp | 164 +++++++++++++++++++-- unittests/ProfileData/InstrProfTest.cpp | 44 +++++- 3 files changed, 223 insertions(+), 15 deletions(-) diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h index efc22dcd0d9..08d78227611 100644 --- a/include/llvm/ProfileData/InstrProfReader.h +++ b/include/llvm/ProfileData/InstrProfReader.h @@ -348,6 +348,9 @@ struct InstrProfReaderIndexBase { using OnDiskHashTableImplV3 = OnDiskIterableChainedHashTable; +template +class InstrProfReaderItaniumRemapper; + template class InstrProfReaderIndex : public InstrProfReaderIndexBase { private: @@ -355,6 +358,8 @@ private: typename HashTableImpl::data_iterator RecordIterator; uint64_t FormatVersion; + friend class InstrProfReaderItaniumRemapper; + public: InstrProfReaderIndex(const unsigned char *Buckets, const unsigned char *const Payload, @@ -386,13 +391,26 @@ public: } }; +/// Name matcher supporting fuzzy 
matching of symbol names to names in profiles. +class InstrProfReaderRemapper { +public: + virtual ~InstrProfReaderRemapper() {} + virtual Error populateRemappings() { return Error::success(); } + virtual Error getRecords(StringRef FuncName, + ArrayRef &Data) = 0; +}; + /// Reader for the indexed binary instrprof format. class IndexedInstrProfReader : public InstrProfReader { private: /// The profile data file contents. std::unique_ptr DataBuffer; + /// The profile remapping file contents. + std::unique_ptr RemappingBuffer; /// The index into the profile data. std::unique_ptr Index; + /// The symbol remapper used to look up profile records by function name. + std::unique_ptr Remapper; /// Profile summary data. std::unique_ptr Summary; // Index to the current record in the record array. @@ -404,8 +422,11 @@ private: const unsigned char *Cur); public: - IndexedInstrProfReader(std::unique_ptr DataBuffer) - : DataBuffer(std::move(DataBuffer)), RecordIndex(0) {} + IndexedInstrProfReader( + std::unique_ptr DataBuffer, + std::unique_ptr RemappingBuffer = nullptr) + : DataBuffer(std::move(DataBuffer)), + RemappingBuffer(std::move(RemappingBuffer)), RecordIndex(0) {} IndexedInstrProfReader(const IndexedInstrProfReader &) = delete; IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete; @@ -434,10 +455,11 @@ public: /// Factory method to create an indexed reader. static Expected> - create(const Twine &Path); + create(const Twine &Path, const Twine &RemappingPath = ""); static Expected> - create(std::unique_ptr Buffer); + create(std::unique_ptr Buffer, + std::unique_ptr RemappingBuffer = nullptr); // Used for testing purpose only. 
void setValueProfDataEndianness(support::endianness Endianness) { diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp index 3b704158a5c..eaf0eb04bfb 100644 --- a/lib/ProfileData/InstrProfReader.cpp +++ b/lib/ProfileData/InstrProfReader.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" @@ -23,6 +24,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SymbolRemappingReader.h" #include "llvm/Support/SwapByteOrder.h" #include #include @@ -88,16 +90,29 @@ InstrProfReader::create(std::unique_ptr Buffer) { } Expected> -IndexedInstrProfReader::create(const Twine &Path) { +IndexedInstrProfReader::create(const Twine &Path, const Twine &RemappingPath) { // Set up the buffer to read. auto BufferOrError = setupMemoryBuffer(Path); if (Error E = BufferOrError.takeError()) return std::move(E); - return IndexedInstrProfReader::create(std::move(BufferOrError.get())); + + // Set up the remapping buffer if requested. + std::unique_ptr RemappingBuffer; + std::string RemappingPathStr = RemappingPath.str(); + if (!RemappingPathStr.empty()) { + auto RemappingBufferOrError = setupMemoryBuffer(RemappingPathStr); + if (Error E = RemappingBufferOrError.takeError()) + return std::move(E); + RemappingBuffer = std::move(RemappingBufferOrError.get()); + } + + return IndexedInstrProfReader::create(std::move(BufferOrError.get()), + std::move(RemappingBuffer)); } Expected> -IndexedInstrProfReader::create(std::unique_ptr Buffer) { +IndexedInstrProfReader::create(std::unique_ptr Buffer, + std::unique_ptr RemappingBuffer) { // Sanity check the buffer. 
if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits::max()) return make_error(instrprof_error::too_large); @@ -105,7 +120,8 @@ IndexedInstrProfReader::create(std::unique_ptr Buffer) { // Create the reader. if (!IndexedInstrProfReader::hasFormat(*Buffer)) return make_error(instrprof_error::bad_magic); - auto Result = llvm::make_unique(std::move(Buffer)); + auto Result = llvm::make_unique( + std::move(Buffer), std::move(RemappingBuffer)); // Initialize the reader and return the result. if (Error E = initializeReader(*Result)) @@ -587,6 +603,124 @@ InstrProfReaderIndex::InstrProfReaderIndex( RecordIterator = HashTable->data_begin(); } +namespace { +/// A remapper that does not apply any remappings. +class InstrProfReaderNullRemapper : public InstrProfReaderRemapper { + InstrProfReaderIndexBase &Underlying; + +public: + InstrProfReaderNullRemapper(InstrProfReaderIndexBase &Underlying) + : Underlying(Underlying) {} + + Error getRecords(StringRef FuncName, + ArrayRef &Data) override { + return Underlying.getRecords(FuncName, Data); + } +}; +} + +/// A remapper that applies remappings based on a symbol remapping file. +template +class llvm::InstrProfReaderItaniumRemapper + : public InstrProfReaderRemapper { +public: + InstrProfReaderItaniumRemapper( + std::unique_ptr RemapBuffer, + InstrProfReaderIndex &Underlying) + : RemapBuffer(std::move(RemapBuffer)), Underlying(Underlying) { + } + + /// Extract the original function name from a PGO function name. + static StringRef extractName(StringRef Name) { + // We can have multiple :-separated pieces; there can be pieces both + // before and after the mangled name. Find the first part that starts + // with '_Z'; we'll assume that's the mangled name we want. 
+ std::pair Parts = {StringRef(), Name}; + while (true) { + Parts = Parts.second.split(':'); + if (Parts.first.startswith("_Z")) + return Parts.first; + if (Parts.second.empty()) + return Name; + } + } + + /// Given a mangled name extracted from a PGO function name, and a new + /// form for that mangled name, reconstitute the name. + static void reconstituteName(StringRef OrigName, StringRef ExtractedName, + StringRef Replacement, + SmallVectorImpl &Out) { + Out.reserve(OrigName.size() + Replacement.size() - ExtractedName.size()); + Out.insert(Out.end(), OrigName.begin(), ExtractedName.begin()); + Out.insert(Out.end(), Replacement.begin(), Replacement.end()); + Out.insert(Out.end(), ExtractedName.end(), OrigName.end()); + } + + Error populateRemappings() override { + if (Error E = Remappings.read(*RemapBuffer)) + return E; + for (StringRef Name : Underlying.HashTable->keys()) { + StringRef RealName = extractName(Name); + if (auto Key = Remappings.insert(RealName)) { + // FIXME: We could theoretically map the same equivalence class to + // multiple names in the profile data. If that happens, we should + // return NamedInstrProfRecords from all of them. + MappedNames.insert({Key, RealName}); + } + } + return Error::success(); + } + + Error getRecords(StringRef FuncName, + ArrayRef &Data) override { + StringRef RealName = extractName(FuncName); + if (auto Key = Remappings.lookup(RealName)) { + StringRef Remapped = MappedNames.lookup(Key); + if (!Remapped.empty()) { + if (RealName.begin() == FuncName.begin() && + RealName.end() == FuncName.end()) + FuncName = Remapped; + else { + // Try rebuilding the name from the given remapping. + SmallString<256> Reconstituted; + reconstituteName(FuncName, RealName, Remapped, Reconstituted); + Error E = Underlying.getRecords(Reconstituted, Data); + if (!E) + return E; + + // If we failed because the name doesn't exist, fall back to asking + // about the original name. 
+ if (Error Unhandled = handleErrors( + std::move(E), [](std::unique_ptr Err) { + return Err->get() == instrprof_error::unknown_function + ? Error::success() + : Error(std::move(Err)); + })) + return Unhandled; + } + } + } + return Underlying.getRecords(FuncName, Data); + } + +private: + /// The memory buffer containing the remapping configuration. Remappings + /// holds pointers into this buffer. + std::unique_ptr RemapBuffer; + + /// The mangling remapper. + SymbolRemappingReader Remappings; + + /// Mapping from mangled name keys to the name used for the key in the + /// profile data. + /// FIXME: Can we store a location within the on-disk hash table instead of + /// redoing lookup? + DenseMap MappedNames; + + /// The real profile data reader. + InstrProfReaderIndex &Underlying; +}; + bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { using namespace support; @@ -683,10 +817,22 @@ Error IndexedInstrProfReader::readHeader() { uint64_t HashOffset = endian::byte_swap(Header->HashOffset); // The rest of the file is an on disk hash table. - InstrProfReaderIndexBase *IndexPtr = nullptr; - IndexPtr = new InstrProfReaderIndex( - Start + HashOffset, Cur, Start, HashType, FormatVersion); - Index.reset(IndexPtr); + auto IndexPtr = + llvm::make_unique>( + Start + HashOffset, Cur, Start, HashType, FormatVersion); + + // Load the remapping table now if requested. + if (RemappingBuffer) { + Remapper = llvm::make_unique< + InstrProfReaderItaniumRemapper>( + std::move(RemappingBuffer), *IndexPtr); + if (Error E = Remapper->populateRemappings()) + return E; + } else { + Remapper = llvm::make_unique(*IndexPtr); + } + Index = std::move(IndexPtr); + return success(); } @@ -707,7 +853,7 @@ Expected IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, uint64_t FuncHash) { ArrayRef Data; - Error Err = Index->getRecords(FuncName, Data); + Error Err = Remapper->getRecords(FuncName, Data); if (Err) return std::move(Err); // Found it. 
Look for counters with the right hash. diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp index 0c99f7fde65..2d915d44598 100644 --- a/unittests/ProfileData/InstrProfTest.cpp +++ b/unittests/ProfileData/InstrProfTest.cpp @@ -42,8 +42,10 @@ struct InstrProfTest : ::testing::Test { void SetUp() { Writer.setOutputSparse(false); } - void readProfile(std::unique_ptr Profile) { - auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile)); + void readProfile(std::unique_ptr Profile, + std::unique_ptr Remapping = nullptr) { + auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile), + std::move(Remapping)); EXPECT_THAT_ERROR(ReaderOrErr.takeError(), Succeeded()); Reader = std::move(ReaderOrErr.get()); } @@ -990,6 +992,44 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_compression_test) { } } +TEST_P(MaybeSparseInstrProfTest, remapping_test) { + Writer.addRecord({"_Z3fooi", 0x1234, {1, 2, 3, 4}}, Err); + Writer.addRecord({"file:_Z3barf", 0x567, {5, 6, 7}}, Err); + auto Profile = Writer.writeBuffer(); + readProfile(std::move(Profile), llvm::MemoryBuffer::getMemBuffer(R"( + type i l + name 3bar 4quux + )")); + + std::vector Counts; + for (StringRef FooName : {"_Z3fooi", "_Z3fool"}) { + EXPECT_THAT_ERROR(Reader->getFunctionCounts(FooName, 0x1234, Counts), + Succeeded()); + ASSERT_EQ(4u, Counts.size()); + EXPECT_EQ(1u, Counts[0]); + EXPECT_EQ(2u, Counts[1]); + EXPECT_EQ(3u, Counts[2]); + EXPECT_EQ(4u, Counts[3]); + } + + for (StringRef BarName : {"file:_Z3barf", "file:_Z4quuxf"}) { + EXPECT_THAT_ERROR(Reader->getFunctionCounts(BarName, 0x567, Counts), + Succeeded()); + ASSERT_EQ(3u, Counts.size()); + EXPECT_EQ(5u, Counts[0]); + EXPECT_EQ(6u, Counts[1]); + EXPECT_EQ(7u, Counts[2]); + } + + for (StringRef BadName : {"_Z3foof", "_Z4quuxi", "_Z3barl", "", "_ZZZ", + "_Z3barf", "otherfile:_Z4quuxf"}) { + EXPECT_THAT_ERROR(Reader->getFunctionCounts(BadName, 0x1234, Counts), + Failed()); + 
EXPECT_THAT_ERROR(Reader->getFunctionCounts(BadName, 0x567, Counts), + Failed()); + } +} + TEST_F(SparseInstrProfTest, preserve_no_records) { Writer.addRecord({"foo", 0x1234, {0}}, Err); Writer.addRecord({"bar", 0x4321, {0, 0}}, Err); -- GitLab From 3cf846acd82f47357fa277a98871d16608d724e3 Mon Sep 17 00:00:00 2001 From: Armando Montanez Date: Wed, 10 Oct 2018 21:16:57 +0000 Subject: [PATCH 0026/1116] Test commit: fix typo in comment git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344185 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objcopy/llvm-objcopy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp index 3e494f92b67..41c6ef3f3dc 100644 --- a/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/tools/llvm-objcopy/llvm-objcopy.cpp @@ -523,7 +523,7 @@ static void handleArgs(const CopyConfig &Config, Object &Obj, // The purpose of this loop is to mark symbols referenced by sections // (like GroupSection or RelocationSection). This way, we know which - // symbols are still 'needed' and wich are not. + // symbols are still 'needed' and which are not. if (Config.StripUnneeded) { for (auto &Section : Obj.sections()) Section.markSymbols(); -- GitLab From ea46abe2cc75b90acc4d34c28921c41b1e547598 Mon Sep 17 00:00:00 2001 From: George Burgess IV Date: Wed, 10 Oct 2018 21:28:44 +0000 Subject: [PATCH 0027/1116] Replace most users of UnknownSize with LocationSize::unknown(); NFC Moving away from UnknownSize is part of the effort to migrate us to LocationSizes (e.g. the cleanup promised in D44748). This doesn't entirely remove all of the uses of UnknownSize; some uses require tweaks to assume that UnknownSize isn't just some kind of int. This patch is intended to just be a trivial replacement for all places where LocationSize::unknown() will Just Work. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344186 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/AliasAnalysis.h | 3 +- .../llvm/Analysis/MemoryDependenceAnalysis.h | 2 +- include/llvm/Analysis/MemoryLocation.h | 2 +- lib/Analysis/AliasSetTracker.cpp | 2 +- lib/Analysis/BasicAliasAnalysis.cpp | 39 +++++++++---------- lib/Analysis/CFLAndersAliasAnalysis.cpp | 6 +-- lib/Analysis/DependenceAnalysis.cpp | 4 +- lib/Analysis/LoopAccessAnalysis.cpp | 4 +- lib/Analysis/MemoryLocation.cpp | 10 +++-- lib/CodeGen/ImplicitNullChecks.cpp | 10 ++--- lib/CodeGen/MachinePipeliner.cpp | 4 +- lib/Target/ARM/ARMParallelDSP.cpp | 2 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 2 +- lib/Transforms/IPO/FunctionAttrs.cpp | 2 +- lib/Transforms/Scalar/LICM.cpp | 2 +- unittests/Analysis/AliasAnalysisTest.cpp | 4 +- 16 files changed, 48 insertions(+), 50 deletions(-) diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h index be3496bbd95..88a70f4fe59 100644 --- a/include/llvm/Analysis/AliasAnalysis.h +++ b/include/llvm/Analysis/AliasAnalysis.h @@ -335,8 +335,7 @@ public: /// A convenience wrapper around the primary \c alias interface. AliasResult alias(const Value *V1, const Value *V2) { - return alias(V1, MemoryLocation::UnknownSize, V2, - MemoryLocation::UnknownSize); + return alias(V1, LocationSize::unknown(), V2, LocationSize::unknown()); } /// A trivial helper function to check to see if the specified pointers are diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h index 1c40cffc7f6..52340b0cb51 100644 --- a/include/llvm/Analysis/MemoryDependenceAnalysis.h +++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h @@ -304,7 +304,7 @@ private: /// The maximum size of the dereferences of the pointer. /// /// May be UnknownSize if the sizes are unknown. 
- LocationSize Size = MemoryLocation::UnknownSize; + LocationSize Size = LocationSize::unknown(); /// The AA tags associated with dereferences of the pointer. /// /// The members may be null if there are no tags or conflicting tags. diff --git a/include/llvm/Analysis/MemoryLocation.h b/include/llvm/Analysis/MemoryLocation.h index 509efa2ca1d..cf839c5a1eb 100644 --- a/include/llvm/Analysis/MemoryLocation.h +++ b/include/llvm/Analysis/MemoryLocation.h @@ -239,7 +239,7 @@ public: } explicit MemoryLocation(const Value *Ptr = nullptr, - LocationSize Size = UnknownSize, + LocationSize Size = LocationSize::unknown(), const AAMDNodes &AATags = AAMDNodes()) : Ptr(Ptr), Size(Size), AATags(AATags) {} diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp index 0d0277e9c34..66544c51446 100644 --- a/lib/Analysis/AliasSetTracker.cpp +++ b/lib/Analysis/AliasSetTracker.cpp @@ -649,7 +649,7 @@ void AliasSet::print(raw_ostream &OS) const { for (iterator I = begin(), E = end(); I != E; ++I) { if (I != begin()) OS << ", "; I.getPointer()->printAsOperand(OS << "("); - if (I.getSize() == MemoryLocation::UnknownSize) + if (I.getSize() == LocationSize::unknown()) OS << ", unknown)"; else OS << ", " << I.getSize() << ")"; diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 2f513004fe8..b7aa395ab84 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -1019,8 +1019,8 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1, // If we don't know the size of the accesses through both GEPs, we can't // determine whether the struct fields accessed can't alias. 
- if (MaybeV1Size == MemoryLocation::UnknownSize || - MaybeV2Size == MemoryLocation::UnknownSize) + if (MaybeV1Size == LocationSize::unknown() || + MaybeV2Size == LocationSize::unknown()) return MayAlias; const uint64_t V1Size = MaybeV1Size.getValue(); @@ -1184,8 +1184,7 @@ bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp, const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject, LocationSize MaybeObjectAccessSize) { // If the object access size is unknown, or the GEP isn't inbounds, bail. - if (MaybeObjectAccessSize == MemoryLocation::UnknownSize || - !GEPOp->isInBounds()) + if (MaybeObjectAccessSize == LocationSize::unknown() || !GEPOp->isInBounds()) return false; const uint64_t ObjectAccessSize = MaybeObjectAccessSize.getValue(); @@ -1254,8 +1253,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, return NoAlias; // Do the base pointers alias? AliasResult BaseAlias = - aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, AAMDNodes(), - UnderlyingV2, MemoryLocation::UnknownSize, AAMDNodes()); + aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(), + UnderlyingV2, LocationSize::unknown(), AAMDNodes()); // Check for geps of non-aliasing underlying pointers where the offsets are // identical. @@ -1314,13 +1313,12 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, // pointer, we know they cannot alias. // If both accesses are unknown size, we can't do anything useful here. 
- if (V1Size == MemoryLocation::UnknownSize && - V2Size == MemoryLocation::UnknownSize) + if (V1Size == LocationSize::unknown() && V2Size == LocationSize::unknown()) return MayAlias; - AliasResult R = aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, - AAMDNodes(), V2, MemoryLocation::UnknownSize, - V2AAInfo, nullptr, UnderlyingV2); + AliasResult R = + aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(), V2, + LocationSize::unknown(), V2AAInfo, nullptr, UnderlyingV2); if (R != MustAlias) { // If V2 may alias GEP base pointer, conservatively returns MayAlias. // If V2 is known not to alias GEP base pointer, then the two values @@ -1351,7 +1349,7 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, // greater, we know they do not overlap. if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) { if (GEP1BaseOffset >= 0) { - if (V2Size != MemoryLocation::UnknownSize) { + if (V2Size != LocationSize::unknown()) { if ((uint64_t)GEP1BaseOffset < V2Size.getValue()) return PartialAlias; return NoAlias; @@ -1365,8 +1363,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, // GEP1 V2 // We need to know that V2Size is not unknown, otherwise we might have // stripped a gep with negative index ('gep , -1, ...). - if (V1Size != MemoryLocation::UnknownSize && - V2Size != MemoryLocation::UnknownSize) { + if (V1Size != LocationSize::unknown() && + V2Size != LocationSize::unknown()) { if (-(uint64_t)GEP1BaseOffset < V1Size.getValue()) return PartialAlias; return NoAlias; @@ -1416,9 +1414,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, // mod Modulo. Check whether that difference guarantees that the // two locations do not alias. 
uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1); - if (V1Size != MemoryLocation::UnknownSize && - V2Size != MemoryLocation::UnknownSize && - ModOffset >= V2Size.getValue() && + if (V1Size != LocationSize::unknown() && + V2Size != LocationSize::unknown() && ModOffset >= V2Size.getValue() && V1Size.getValue() <= Modulo - ModOffset) return NoAlias; @@ -1426,7 +1423,7 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size, // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr. if (AllPositive && GEP1BaseOffset > 0 && - V2Size != MemoryLocation::UnknownSize && + V2Size != LocationSize::unknown() && V2Size.getValue() <= (uint64_t)GEP1BaseOffset) return NoAlias; @@ -1607,7 +1604,7 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, // unknown to represent all the possible values the GEP could advance the // pointer to. if (isRecursive) - PNSize = MemoryLocation::UnknownSize; + PNSize = LocationSize::unknown(); AliasResult Alias = aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0], @@ -1864,8 +1861,8 @@ bool BasicAAResult::constantOffsetHeuristic( const SmallVectorImpl &VarIndices, LocationSize MaybeV1Size, LocationSize MaybeV2Size, int64_t BaseOffset, AssumptionCache *AC, DominatorTree *DT) { - if (VarIndices.size() != 2 || MaybeV1Size == MemoryLocation::UnknownSize || - MaybeV2Size == MemoryLocation::UnknownSize) + if (VarIndices.size() != 2 || MaybeV1Size == LocationSize::unknown() || + MaybeV2Size == LocationSize::unknown()) return false; const uint64_t V1Size = MaybeV1Size.getValue(); diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp index b43b48eeef7..1c61dd369a0 100644 --- a/lib/Analysis/CFLAndersAliasAnalysis.cpp +++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp @@ -556,9 +556,9 @@ bool CFLAndersAAResult::FunctionInfo::mayAlias( OffsetValue{RHS, 0}, Comparator); if (RangePair.first != 
RangePair.second) { - // Be conservative about UnknownSize - if (MaybeLHSSize == MemoryLocation::UnknownSize || - MaybeRHSSize == MemoryLocation::UnknownSize) + // Be conservative about unknown sizes + if (MaybeLHSSize == LocationSize::unknown() || + MaybeRHSSize == LocationSize::unknown()) return true; const uint64_t LHSSize = MaybeLHSSize.getValue(); diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp index 79c2728d562..b544ae5f535 100644 --- a/lib/Analysis/DependenceAnalysis.cpp +++ b/lib/Analysis/DependenceAnalysis.cpp @@ -633,8 +633,8 @@ static AliasResult underlyingObjectsAlias(AliasAnalysis *AA, const MemoryLocation &LocB) { // Check the original locations (minus size) for noalias, which can happen for // tbaa, incompatible underlying object locations, etc. - MemoryLocation LocAS(LocA.Ptr, MemoryLocation::UnknownSize, LocA.AATags); - MemoryLocation LocBS(LocB.Ptr, MemoryLocation::UnknownSize, LocB.AATags); + MemoryLocation LocAS(LocA.Ptr, LocationSize::unknown(), LocA.AATags); + MemoryLocation LocBS(LocB.Ptr, LocationSize::unknown(), LocB.AATags); if (AA->alias(LocAS, LocBS) == NoAlias) return NoAlias; diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index 8312a0d1cff..b43e290956d 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -509,7 +509,7 @@ public: /// Register a load and whether it is only read from. void addLoad(MemoryLocation &Loc, bool IsReadOnly) { Value *Ptr = const_cast(Loc.Ptr); - AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags); + AST.add(Ptr, LocationSize::unknown(), Loc.AATags); Accesses.insert(MemAccessInfo(Ptr, false)); if (IsReadOnly) ReadOnlyPtr.insert(Ptr); @@ -518,7 +518,7 @@ public: /// Register a store. 
void addStore(MemoryLocation &Loc) { Value *Ptr = const_cast(Loc.Ptr); - AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags); + AST.add(Ptr, LocationSize::unknown(), Loc.AATags); Accesses.insert(MemAccessInfo(Ptr, true)); } diff --git a/lib/Analysis/MemoryLocation.cpp b/lib/Analysis/MemoryLocation.cpp index 3cd4b4475ef..c0605f6ad37 100644 --- a/lib/Analysis/MemoryLocation.cpp +++ b/lib/Analysis/MemoryLocation.cpp @@ -55,7 +55,8 @@ MemoryLocation MemoryLocation::get(const VAArgInst *VI) { AAMDNodes AATags; VI->getAAMetadata(AATags); - return MemoryLocation(VI->getPointerOperand(), UnknownSize, AATags); + return MemoryLocation(VI->getPointerOperand(), LocationSize::unknown(), + AATags); } MemoryLocation MemoryLocation::get(const AtomicCmpXchgInst *CXI) { @@ -87,7 +88,7 @@ MemoryLocation MemoryLocation::getForSource(const AtomicMemTransferInst *MTI) { } MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) { - uint64_t Size = UnknownSize; + uint64_t Size = MemoryLocation::UnknownSize; if (ConstantInt *C = dyn_cast(MTI->getLength())) Size = C->getValue().getZExtValue(); @@ -108,7 +109,7 @@ MemoryLocation MemoryLocation::getForDest(const AtomicMemIntrinsic *MI) { } MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) { - uint64_t Size = UnknownSize; + uint64_t Size = MemoryLocation::UnknownSize; if (ConstantInt *C = dyn_cast(MI->getLength())) Size = C->getValue().getZExtValue(); @@ -189,5 +190,6 @@ MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS, } // FIXME: Handle memset_pattern4 and memset_pattern8 also. 
- return MemoryLocation(CS.getArgument(ArgIdx), UnknownSize, AATags); + return MemoryLocation(CS.getArgument(ArgIdx), LocationSize::unknown(), + AATags); } diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 034692de92d..deb49a1ea48 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -344,11 +344,11 @@ ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI, return AR_MayAlias; continue; } - llvm::AliasResult AAResult = AA->alias( - MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize, - MMO1->getAAInfo()), - MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize, - MMO2->getAAInfo())); + llvm::AliasResult AAResult = + AA->alias(MemoryLocation(MMO1->getValue(), LocationSize::unknown(), + MMO1->getAAInfo()), + MemoryLocation(MMO2->getValue(), LocationSize::unknown(), + MMO2->getAAInfo())); if (AAResult != NoAlias) return AR_MayAlias; } diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 5f6f0cf96a5..3d8510f7c0c 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -1136,9 +1136,9 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) { continue; } AliasResult AAResult = AA->alias( - MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize, + MemoryLocation(MMO1->getValue(), LocationSize::unknown(), MMO1->getAAInfo()), - MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize, + MemoryLocation(MMO2->getValue(), LocationSize::unknown(), MMO2->getAAInfo())); if (AAResult != NoAlias) { diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp index 050a76413cf..3ab9298c110 100644 --- a/lib/Target/ARM/ARMParallelDSP.cpp +++ b/lib/Target/ARM/ARMParallelDSP.cpp @@ -71,7 +71,7 @@ namespace { virtual ~OpChain() = default; void SetMemoryLocations() { - const auto Size = MemoryLocation::UnknownSize; + const auto Size = LocationSize::unknown(); for (auto *V : 
AllValues) { if (auto *I = dyn_cast(V)) { if (I->mayWriteToMemory()) diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index f9ed0390923..f38992bef69 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1970,7 +1970,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, // Get the location that may be stored across the loop. Since the access // is strided positively through memory, we say that the modified location // starts at the pointer and has infinite size. - LocationSize AccessSize = MemoryLocation::UnknownSize; + LocationSize AccessSize = LocationSize::unknown(); // If the loop iterates a fixed number of times, we can refine the access // size to be exactly the size of the memset, which is (BECount+1)*StoreSize diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 72c850fca99..f01c6a4e99b 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -165,7 +165,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, AAMDNodes AAInfo; I->getAAMetadata(AAInfo); - MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo); + MemoryLocation Loc(Arg, LocationSize::unknown(), AAInfo); // Skip accesses to local or constant memory as they don't impact the // externally visible mod/ref behavior. 
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index bb918cf717d..601d49fc03f 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -693,7 +693,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, for (Value *Op : CI->arg_operands()) if (Op->getType()->isPointerTy() && pointerInvalidatedByLoop( - MemoryLocation(Op, MemoryLocation::UnknownSize, AAMDNodes()), + MemoryLocation(Op, LocationSize::unknown(), AAMDNodes()), CurAST, CurLoop, AA)) return false; return true; diff --git a/unittests/Analysis/AliasAnalysisTest.cpp b/unittests/Analysis/AliasAnalysisTest.cpp index 0f0d44f6c78..42a4210feba 100644 --- a/unittests/Analysis/AliasAnalysisTest.cpp +++ b/unittests/Analysis/AliasAnalysisTest.cpp @@ -55,8 +55,8 @@ struct AATestPass : FunctionPass { for (Value *P1 : Pointers) for (Value *P2 : Pointers) - (void)AA.alias(P1, MemoryLocation::UnknownSize, P2, - MemoryLocation::UnknownSize); + (void)AA.alias(P1, LocationSize::unknown(), P2, + LocationSize::unknown()); return false; } -- GitLab From 04af5ff3eb6ec26c853c7aabe3f282f6a293ef9f Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 10 Oct 2018 21:31:01 +0000 Subject: [PATCH 0028/1116] Support for remapping profile data when symbols change, for sample-based profiling. 
Reviewers: davidxl, tejohnson, dlj, erik.pilkington Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D51248 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344187 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ProfileData/SampleProfReader.h | 52 ++++++++++++++++-- lib/ProfileData/SampleProfReader.cpp | 55 +++++++++++++++++++ unittests/ProfileData/SampleProfTest.cpp | 59 ++++++++++++++------- 3 files changed, 144 insertions(+), 22 deletions(-) diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h index c100e800464..3c477cc3471 100644 --- a/include/llvm/ProfileData/SampleProfReader.h +++ b/include/llvm/ProfileData/SampleProfReader.h @@ -222,6 +222,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SymbolRemappingReader.h" #include #include #include @@ -289,11 +290,16 @@ public: // The function name may have been updated by adding suffix. In sample // profile, the function names are all stripped, so we need to strip // the function name suffix before matching with profile. - StringRef Fname = F.getName().split('.').first; + return getSamplesFor(F.getName().split('.').first); + } + + /// Return the samples collected for function \p F. + virtual FunctionSamples *getSamplesFor(StringRef Fname) { std::string FGUID; Fname = getRepInFormat(Fname, getFormat(), FGUID); - if (Profiles.count(Fname)) - return &Profiles[Fname]; + auto It = Profiles.find(Fname); + if (It != Profiles.end()) + return &It->second; return nullptr; } @@ -337,6 +343,12 @@ protected: /// Profile summary information. std::unique_ptr Summary; + /// Take ownership of the summary of this reader. + static std::unique_ptr + takeSummary(SampleProfileReader &Reader) { + return std::move(Reader.Summary); + } + /// Compute summary for this profile. 
void computeSummary(); @@ -525,6 +537,40 @@ protected: static const uint32_t GCOVTagAFDOFunction = 0xac000000; }; +/// A profile data reader proxy that remaps the profile data from another +/// sample profile data reader, by applying a provided set of equivalences +/// between components of the symbol names in the profile. +class SampleProfileReaderItaniumRemapper : public SampleProfileReader { +public: + SampleProfileReaderItaniumRemapper( + std::unique_ptr B, LLVMContext &C, + std::unique_ptr Underlying) + : SampleProfileReader(std::move(B), C, Underlying->getFormat()) { + Profiles = std::move(Underlying->getProfiles()); + Summary = takeSummary(*Underlying); + } + + /// Create a remapped sample profile from the given remapping file and + /// underlying samples. + static ErrorOr> + create(const Twine &Filename, LLVMContext &C, + std::unique_ptr Underlying); + + /// Read and validate the file header. + std::error_code readHeader() override { return sampleprof_error::success; } + + /// Read remapping file and apply it to the sample profile. + std::error_code read() override; + + /// Return the samples collected for function \p F. + FunctionSamples *getSamplesFor(StringRef FunctionName) override; + using SampleProfileReader::getSamplesFor; + +private: + SymbolRemappingReader Remappings; + DenseMap SampleMap; +}; + } // end namespace sampleprof } // end namespace llvm diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp index 2b4551b9849..a68d1e9d3ab 100644 --- a/lib/ProfileData/SampleProfReader.cpp +++ b/lib/ProfileData/SampleProfReader.cpp @@ -912,6 +912,40 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) { return Magic == "adcg*704"; } +std::error_code SampleProfileReaderItaniumRemapper::read() { + // If the underlying data is in compact format, we can't remap it because + // we don't know what the original function names were. 
+ if (getFormat() == SPF_Compact_Binary) { + Ctx.diagnose(DiagnosticInfoSampleProfile( + Buffer->getBufferIdentifier(), + "Profile data remapping cannot be applied to profile data " + "in compact format (original mangled names are not available).", + DS_Warning)); + return sampleprof_error::success; + } + + if (Error E = Remappings.read(*Buffer)) { + handleAllErrors( + std::move(E), [&](const SymbolRemappingParseError &ParseError) { + reportError(ParseError.getLineNum(), ParseError.getMessage()); + }); + return sampleprof_error::malformed; + } + + for (auto &Sample : getProfiles()) + if (auto Key = Remappings.insert(Sample.first())) + SampleMap.insert({Key, &Sample.second}); + + return sampleprof_error::success; +} + +FunctionSamples * +SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) { + if (auto Key = Remappings.lookup(Fname)) + return SampleMap.lookup(Key); + return SampleProfileReader::getSamplesFor(Fname); +} + /// Prepare a memory buffer for the contents of \p Filename. /// /// \returns an error code indicating the status of the buffer. @@ -944,6 +978,27 @@ SampleProfileReader::create(const Twine &Filename, LLVMContext &C) { return create(BufferOrError.get(), C); } +/// Create a sample profile remapper from the given input, to remap the +/// function names in the given profile data. +/// +/// \param Filename The file to open. +/// +/// \param C The LLVM context to use to emit diagnostics. +/// +/// \param Underlying The underlying profile data reader to remap. +/// +/// \returns an error code indicating the status of the created reader. 
+ErrorOr> +SampleProfileReaderItaniumRemapper::create( + const Twine &Filename, LLVMContext &C, + std::unique_ptr Underlying) { + auto BufferOrError = setupMemoryBuffer(Filename); + if (std::error_code EC = BufferOrError.getError()) + return EC; + return llvm::make_unique( + std::move(BufferOrError.get()), C, std::move(Underlying)); +} + /// Create a sample profile reader based on the format of the input data. /// /// \param B The memory buffer to create the reader from (assumes ownership). diff --git a/unittests/ProfileData/SampleProfTest.cpp b/unittests/ProfileData/SampleProfTest.cpp index 73e8088b638..67e6e9fc95b 100644 --- a/unittests/ProfileData/SampleProfTest.cpp +++ b/unittests/ProfileData/SampleProfTest.cpp @@ -58,7 +58,7 @@ struct SampleProfTest : ::testing::Test { Reader->collectFuncsToUse(M); } - void testRoundTrip(SampleProfileFormat Format) { + void testRoundTrip(SampleProfileFormat Format, bool Remap) { SmallVector ProfilePath; ASSERT_TRUE(NoError(llvm::sys::fs::createTemporaryFile("profile", "", ProfilePath))); StringRef Profile(ProfilePath.data(), ProfilePath.size()); @@ -108,22 +108,35 @@ struct SampleProfTest : ::testing::Test { EC = Reader->read(); ASSERT_TRUE(NoError(EC)); - StringMap &ReadProfiles = Reader->getProfiles(); - ASSERT_EQ(2u, ReadProfiles.size()); - - std::string FooGUID; - StringRef FooRep = getRepInFormat(FooName, Format, FooGUID); - FunctionSamples &ReadFooSamples = ReadProfiles[FooRep]; - ASSERT_EQ(7711u, ReadFooSamples.getTotalSamples()); - ASSERT_EQ(610u, ReadFooSamples.getHeadSamples()); - - std::string BarGUID; - StringRef BarRep = getRepInFormat(BarName, Format, BarGUID); - FunctionSamples &ReadBarSamples = ReadProfiles[BarRep]; - ASSERT_EQ(20301u, ReadBarSamples.getTotalSamples()); - ASSERT_EQ(1437u, ReadBarSamples.getHeadSamples()); + if (Remap) { + auto MemBuffer = llvm::MemoryBuffer::getMemBuffer(R"( + # Types 'int' and 'long' are equivalent + type i l + # Function names 'foo' and 'faux' are equivalent + name 3foo 4faux 
+ )"); + Reader.reset(new SampleProfileReaderItaniumRemapper( + std::move(MemBuffer), Context, std::move(Reader))); + FooName = "_Z4fauxi"; + BarName = "_Z3barl"; + + EC = Reader->read(); + ASSERT_TRUE(NoError(EC)); + } + + ASSERT_EQ(2u, Reader->getProfiles().size()); + + FunctionSamples *ReadFooSamples = Reader->getSamplesFor(FooName); + ASSERT_TRUE(ReadFooSamples != nullptr); + ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples()); + ASSERT_EQ(610u, ReadFooSamples->getHeadSamples()); + + FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName); + ASSERT_TRUE(ReadBarSamples != nullptr); + ASSERT_EQ(20301u, ReadBarSamples->getTotalSamples()); + ASSERT_EQ(1437u, ReadBarSamples->getHeadSamples()); ErrorOr CTMap = - ReadBarSamples.findCallTargetMapAt(1, 0); + ReadBarSamples->findCallTargetMapAt(1, 0); ASSERT_FALSE(CTMap.getError()); std::string MconstructGUID; @@ -184,15 +197,23 @@ struct SampleProfTest : ::testing::Test { }; TEST_F(SampleProfTest, roundtrip_text_profile) { - testRoundTrip(SampleProfileFormat::SPF_Text); + testRoundTrip(SampleProfileFormat::SPF_Text, false); } TEST_F(SampleProfTest, roundtrip_raw_binary_profile) { - testRoundTrip(SampleProfileFormat::SPF_Binary); + testRoundTrip(SampleProfileFormat::SPF_Binary, false); } TEST_F(SampleProfTest, roundtrip_compact_binary_profile) { - testRoundTrip(SampleProfileFormat::SPF_Compact_Binary); + testRoundTrip(SampleProfileFormat::SPF_Compact_Binary, false); +} + +TEST_F(SampleProfTest, remap_text_profile) { + testRoundTrip(SampleProfileFormat::SPF_Text, true); +} + +TEST_F(SampleProfTest, remap_raw_binary_profile) { + testRoundTrip(SampleProfileFormat::SPF_Binary, true); } TEST_F(SampleProfTest, sample_overflow_saturation) { -- GitLab From 8313c3b553f996945d5c73734d027fa01af69115 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Wed, 10 Oct 2018 21:36:12 +0000 Subject: [PATCH 0029/1116] [CMake] NFC. 
Updating documentation on options The Ninja pool options are only supported with the Ninja generator and should be called out as such. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344188 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/HandleLLVMOptions.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 2c9bd14ad05..0daaf7d95c0 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -23,7 +23,7 @@ string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO) # Ninja Job Pool support # The following only works with the Ninja generator in CMake >= 3.0. set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING - "Define the maximum number of concurrent compilation jobs.") + "Define the maximum number of concurrent compilation jobs (Ninja only).") if(LLVM_PARALLEL_COMPILE_JOBS) if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja") message(WARNING "Job pooling is only available with Ninja generators.") @@ -34,7 +34,7 @@ if(LLVM_PARALLEL_COMPILE_JOBS) endif() set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING - "Define the maximum number of concurrent link jobs.") + "Define the maximum number of concurrent link jobs (Ninja only).") if(CMAKE_MAKE_PROGRAM MATCHES "ninja") if(NOT LLVM_PARALLEL_LINK_JOBS AND uppercase_LLVM_ENABLE_LTO STREQUAL "THIN") message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.") -- GitLab From 63c98b331978f03c4fa392b9a032fad24596dde8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 10 Oct 2018 21:48:34 +0000 Subject: [PATCH 0030/1116] [X86] Prevent non-temporal loads from folding into instructions by blocking them in X86DAGToDAGISel::IsProfitableToFold rather than with a predicate. Remove tryFoldVecLoad since tryFoldLoad would call IsProfitableToFold and pick up the new check. This saves about 5K out of ~600K on the generated isel table. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344189 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 32 ++++----------- lib/Target/X86/X86InstrFragmentsSIMD.td | 53 +++++++++++-------------- 2 files changed, 31 insertions(+), 54 deletions(-) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 25a8567a9c1..5eb4dbb1d98 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -239,12 +239,6 @@ namespace { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } - // Try to fold a vector load. This makes sure the load isn't non-temporal. - bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, - SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp, - SDValue &Segment); - /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -516,6 +510,10 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (N.getOpcode() != ISD::LOAD) return true; + // Don't fold non-temporal loads if we have an instruction for them. + if (useNonTemporalLoad(cast(N))) + return false; + // If N is a load, do additional profitability checks. if (U == Root) { switch (U->getOpcode()) { @@ -2053,20 +2051,6 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } -bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, - SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp, - SDValue &Segment) { - if (!ISD::isNON_EXTLoad(N.getNode()) || - useNonTemporalLoad(cast(N)) || - !IsProfitableToFold(N, P, Root) || - !IsLegalToFold(N, P, Root, OptLevel)) - return false; - - return selectAddr(N.getNode(), - N.getOperand(1), Base, Scale, Index, Disp, Segment); -} - /// Return an SDNode that returns the value of the global base register. 
/// Output instructions required to initialize the global base register, /// if necessary. @@ -2595,8 +2579,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, // alignment on this load. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() && - tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2, - Tmp3, Tmp4)) { + tryFoldLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { SDValue Load = N1.getOperand(0); SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, Load.getOperand(0) }; @@ -2632,8 +2616,8 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, // alignment on this load. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() && - tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2, - Tmp3, Tmp4)) { + tryFoldLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { SDValue Load = N2.getOperand(0); SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, Load.getOperand(0), InFlag }; diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 3aa825ee84e..f750fe3ee0c 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -647,28 +647,22 @@ def sdmem : Operand { // SSE pattern fragments //===----------------------------------------------------------------------===// -// Vector load wrappers to prevent folding of non-temporal aligned loads on -// supporting targets. 
-def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return !useNonTemporalLoad(cast<LoadSDNode>(N)); -}]>; - // 128-bit load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 -def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>; -def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>; -def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>; +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; // 256-bit load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 -def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>; -def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>; -def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>; +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; // 512-bit load pattern fragments -def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>; -def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>; -def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>; +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; // 128-/256-/512-bit extload pattern fragments def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; @@ -682,46 +676,45 @@ def alignedstore : PatFrag<(ops node:$val, node:$ptr), return St->getAlignment() >= St->getMemoryVT().getStoreSize(); }]>; -// Like 'load', but always requires 128-bit vector alignment.
-def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ +// Like 'load', but always requires vector size alignment. +def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ auto *Ld = cast(N); - return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() && - !useNonTemporalLoad(cast(N)); + return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); }]>; // 128-bit aligned load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 def alignedloadv4f32 : PatFrag<(ops node:$ptr), - (v4f32 (alignedvecload node:$ptr))>; + (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), - (v2f64 (alignedvecload node:$ptr))>; + (v2f64 (alignedload node:$ptr))>; def alignedloadv2i64 : PatFrag<(ops node:$ptr), - (v2i64 (alignedvecload node:$ptr))>; + (v2i64 (alignedload node:$ptr))>; // 256-bit aligned load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 def alignedloadv8f32 : PatFrag<(ops node:$ptr), - (v8f32 (alignedvecload node:$ptr))>; + (v8f32 (alignedload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), - (v4f64 (alignedvecload node:$ptr))>; + (v4f64 (alignedload node:$ptr))>; def alignedloadv4i64 : PatFrag<(ops node:$ptr), - (v4i64 (alignedvecload node:$ptr))>; + (v4i64 (alignedload node:$ptr))>; // 512-bit aligned load pattern fragments def alignedloadv16f32 : PatFrag<(ops node:$ptr), - (v16f32 (alignedvecload node:$ptr))>; + (v16f32 (alignedload node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), - (v8f64 (alignedvecload node:$ptr))>; + (v8f64 (alignedload node:$ptr))>; def alignedloadv8i64 : PatFrag<(ops node:$ptr), - (v8i64 (alignedvecload node:$ptr))>; + (v8i64 (alignedload node:$ptr))>; -// Like 'vecload', but uses special alignment checks suitable for use in +// Like 'load', but uses special alignment checks suitable for use in // memory operands in most SSE instructions, which are required to // be naturally aligned on some targets 
but not on others. If the subtarget // allows unaligned accesses, match any load, though this may require // setting a feature bit in the processor (on startup, for example). // Opteron 10h and later implement such a feature. -def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{ +def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ auto *Ld = cast(N); return Subtarget->hasSSEUnalignedMem() || Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); -- GitLab From d784be6ea2204c4936a9cf97d3b260d8fc0b39ba Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 10 Oct 2018 22:52:32 +0000 Subject: [PATCH 0031/1116] [MC][ELF] compute entity size for explicit sections Summary: Global variables might declare themselves to be in explicit sections. Calculate the entity size always to prevent assembler warnings "entity size for SHF_MERGE not specified" when sections are to be marked merge-able. Fixes PR31828. Reviewers: rnk, echristo Reviewed By: rnk Subscribers: llvm-commits, pirama, srhines Differential Revision: https://reviews.llvm.org/D53056 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344197 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 50 +++++++++---------- .../CodeGen/Generic/section_mergeable_size.ll | 3 ++ 2 files changed, 28 insertions(+), 25 deletions(-) create mode 100644 test/CodeGen/Generic/section_mergeable_size.ll diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index f6882c40531..b046cd81d6c 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -506,6 +506,30 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, return OtherGO ? 
dyn_cast(TM.getSymbol(OtherGO)) : nullptr; } +static unsigned getEntrySizeForKind(SectionKind Kind) { + if (Kind.isMergeable1ByteCString()) + return 1; + else if (Kind.isMergeable2ByteCString()) + return 2; + else if (Kind.isMergeable4ByteCString()) + return 4; + else if (Kind.isMergeableConst4()) + return 4; + else if (Kind.isMergeableConst8()) + return 8; + else if (Kind.isMergeableConst16()) + return 16; + else if (Kind.isMergeableConst32()) + return 32; + else { + // We shouldn't have mergeable C strings or mergeable constants that we + // didn't handle above. + assert(!Kind.isMergeableCString() && "unknown string width"); + assert(!Kind.isMergeableConst() && "unknown data width"); + return 0; + } +} + MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { StringRef SectionName = GO->getSection(); @@ -550,7 +574,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( MCSectionELF *Section = getContext().getELFSection( SectionName, getELFSectionType(SectionName, Kind), Flags, - /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); + getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol); // Make sure that we did not get some other section with incompatible sh_link. // This should not be possible due to UniqueID code above. 
assert(Section->getAssociatedSymbol() == AssociatedSymbol && @@ -577,30 +601,6 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro"; } -static unsigned getEntrySizeForKind(SectionKind Kind) { - if (Kind.isMergeable1ByteCString()) - return 1; - else if (Kind.isMergeable2ByteCString()) - return 2; - else if (Kind.isMergeable4ByteCString()) - return 4; - else if (Kind.isMergeableConst4()) - return 4; - else if (Kind.isMergeableConst8()) - return 8; - else if (Kind.isMergeableConst16()) - return 16; - else if (Kind.isMergeableConst32()) - return 32; - else { - // We shouldn't have mergeable C strings or mergeable constants that we - // didn't handle above. - assert(!Kind.isMergeableCString() && "unknown string width"); - assert(!Kind.isMergeableConst() && "unknown data width"); - return 0; - } -} - static MCSectionELF *selectELFSectionForGlobal( MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, diff --git a/test/CodeGen/Generic/section_mergeable_size.ll b/test/CodeGen/Generic/section_mergeable_size.ll new file mode 100644 index 00000000000..fbab7fe849f --- /dev/null +++ b/test/CodeGen/Generic/section_mergeable_size.ll @@ -0,0 +1,3 @@ +; RUN: llc < %s | FileCheck %s +@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4 +; CHECK: .init.rodata,"aM",@progbits,4 -- GitLab From 7f7ab9f57f5c84a64b8c37a2273b99682aab2811 Mon Sep 17 00:00:00 2001 From: Warren Ristow Date: Wed, 10 Oct 2018 22:54:31 +0000 Subject: [PATCH 0032/1116] [LTO] Account for overriding lib calls via the alias attribute Given a library call that is represented as an llvm intrinsic call, but later transformed to an actual call, if an overriding definition of that library routine is provided indirectly via an alias, prevent LTO from eliminating the definition. This is a fix for PR38547. 
Differential Revision: https://reviews.llvm.org/D52836 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344198 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/LTO/UpdateCompilerUsed.cpp | 16 +++-- test/LTO/X86/libcall-overridden-via-alias.ll | 69 ++++++++++++++++++++ 2 files changed, 80 insertions(+), 5 deletions(-) create mode 100755 test/LTO/X86/libcall-overridden-via-alias.ll diff --git a/lib/LTO/UpdateCompilerUsed.cpp b/lib/LTO/UpdateCompilerUsed.cpp index c982a5b0e5a..00482dee6e1 100644 --- a/lib/LTO/UpdateCompilerUsed.cpp +++ b/lib/LTO/UpdateCompilerUsed.cpp @@ -95,12 +95,18 @@ private: if (GV.hasPrivateLinkage()) return; - // Conservatively append user-supplied runtime library functions to - // llvm.compiler.used. These could be internalized and deleted by - // optimizations like -globalopt, causing problems when later optimizations - // add new library calls (e.g., llvm.memset => memset and printf => puts). + // Conservatively append user-supplied runtime library functions (supplied + // either directly, or via a function alias) to llvm.compiler.used. These + // could be internalized and deleted by optimizations like -globalopt, + // causing problems when later optimizations add new library calls (e.g., + // llvm.memset => memset and printf => puts). // Leave it to the linker to remove any dead code (e.g. with -dead_strip). 
- if (isa<Function>(GV) && Libcalls.count(GV.getName())) { + GlobalValue *FuncAliasee = nullptr; + if (isa<GlobalAlias>(GV)) { + auto *A = cast<GlobalAlias>(&GV); + FuncAliasee = dyn_cast<Function>(A->getAliasee()); + } + if ((isa<Function>(GV) || FuncAliasee) && Libcalls.count(GV.getName())) { LLVMUsed.push_back(&GV); return; } diff --git a/test/LTO/X86/libcall-overridden-via-alias.ll b/test/LTO/X86/libcall-overridden-via-alias.ll new file mode 100755 index 00000000000..cac125b2843 --- /dev/null +++ b/test/LTO/X86/libcall-overridden-via-alias.ll @@ -0,0 +1,69 @@ +; Given a library call that is represented as an llvm intrinsic call, but +; later transformed to an actual call, if an overriding definition of that +; library routine is provided indirectly via an alias, verify that LTO +; does not eliminate the definition. This is a test for PR38547. +; +; RUN: llvm-as -o %t1 %s +; RUN: llvm-lto -exported-symbol=main -save-merged-module -filetype=asm -o %t2 %t1 +; RUN: llvm-dis -o - %t2.merged.bc | FileCheck --check-prefix=CHECK_IR %s +; +; Check that the call is represented as an llvm intrinsic in the IR after LTO: +; CHECK_IR-LABEL: main +; CHECK_IR: call float @llvm.log.f32 +; +; Check that the IR contains the overriding definition of the library routine +; in the IR after LTO: +; CHECK_IR: define internal float @logf(float [[X:%.*]]) +; CHECK_IR-NEXT: [[TMP:%.*]] = fadd float [[X]], [[X]] +; CHECK_IR-NEXT: ret float [[TMP]] +; +; Check that the assembly code from LTO contains the call to the expected +; library routine, and that the overriding definition of the library routine +; is present: +; RUN: FileCheck --check-prefix=CHECK_ASM %s < %t2 +; CHECK_ASM-LABEL: main: +; CHECK_ASM: callq logf +; CHECK_ASM-LABEL: logf: +; CHECK_ASM-NEXT: add +; CHECK_ASM-NEXT: ret + +; Produced from the following source-code: +; +;extern float logf(float); +;// 'src' and 'dst' are 'volatile' to prohibit optimization.
+;volatile float src = 3.14f; +;volatile float dst; +; +;int main() { +; dst = logf(src); +; return 0; +;} +; +;extern float fname(float x); +;float fname(float x) { +; return x + x; +;} +; +;float logf(float x) __attribute__((alias("fname"))); +; +target triple = "x86_64-unknown-linux-gnu" + +@src = global float 0x40091EB860000000, align 4 +@dst = common global float 0.000000e+00, align 4 + +@logf = alias float (float), float (float)* @fname + +define i32 @main() local_unnamed_addr { +entry: + %0 = load volatile float, float* @src, align 4 + %1 = tail call float @llvm.log.f32(float %0) + store volatile float %1, float* @dst, align 4 + ret i32 0 +} + +declare float @llvm.log.f32(float) + +define float @fname(float %x) { + %add = fadd float %x, %x + ret float %add +} -- GitLab From 63ec2563a97241c4f743bf5202cb9aae88af3b37 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Wed, 10 Oct 2018 23:13:47 +0000 Subject: [PATCH 0033/1116] Add a flag to remap manglings when reading profile data information. This can be used to preserve profiling information across codebase changes that have widespread impact on mangled names, but across which most profiling data should still be usable. For example, when switching from libstdc++ to libc++, or from the old libstdc++ ABI to the new ABI, or even from a 32-bit to a 64-bit build. The user can provide a remapping file specifying parts of mangled names that should be treated as equivalent (eg, std::__1 should be treated as equivalent to std::__cxx11), and profile data will be treated as applying to a particular function if its name is equivalent to the name of a function in the profile data under the provided equivalences. See the documentation change for a description of how this is configured. Remapping is supported for both sample-based profiling and instruction profiling. We do not support remapping indirect branch target information, but all other profile data should be remapped appropriately. 
Support is only added for the new pass manager. If someone wants to also add support for this for the old pass manager, doing so should be straightforward. This is the LLVM side of Clang r344199. Reviewers: davidxl, tejohnson, dlj, erik.pilkington Subscribers: mehdi_amini, steven_wu, dexonsmith, llvm-commits Differential Revision: https://reviews.llvm.org/D51249 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344200 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/LTO/Config.h | 3 + include/llvm/Passes/PassBuilder.h | 13 ++-- include/llvm/Transforms/IPO/SampleProfile.h | 7 ++- .../Instrumentation/PGOInstrumentation.h | 4 +- lib/LTO/LTO.cpp | 9 ++- lib/LTO/LTOBackend.cpp | 3 +- lib/Passes/PassBuilder.cpp | 9 ++- lib/Transforms/IPO/SampleProfile.cpp | 48 +++++++++++---- .../Instrumentation/PGOInstrumentation.cpp | 23 +++++-- test/Transforms/PGOProfile/Inputs/remap.map | 8 +++ .../PGOProfile/Inputs/remap.proftext | 8 +++ test/Transforms/PGOProfile/remap.ll | 28 +++++++++ .../Transforms/SampleProfile/Inputs/remap.map | 8 +++ .../SampleProfile/Inputs/remap.prof | 10 ++++ test/Transforms/SampleProfile/remap.ll | 60 +++++++++++++++++++ tools/opt/NewPMDriver.cpp | 12 ++-- 16 files changed, 221 insertions(+), 32 deletions(-) create mode 100644 test/Transforms/PGOProfile/Inputs/remap.map create mode 100644 test/Transforms/PGOProfile/Inputs/remap.proftext create mode 100644 test/Transforms/PGOProfile/remap.ll create mode 100644 test/Transforms/SampleProfile/Inputs/remap.map create mode 100644 test/Transforms/SampleProfile/Inputs/remap.prof create mode 100644 test/Transforms/SampleProfile/remap.ll diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h index 57bba5e3484..c0ad32f485c 100644 --- a/include/llvm/LTO/Config.h +++ b/include/llvm/LTO/Config.h @@ -73,6 +73,9 @@ struct Config { /// Sample PGO profile path. std::string SampleProfile; + /// Name remapping file for profile data. 
+ std::string ProfileRemapping; + /// The directory to store .dwo files. std::string DwoDir; diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index 02d3dc324bc..91314430a96 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -32,10 +32,13 @@ class ModuleSummaryIndex; /// A struct capturing PGO tunables. struct PGOOptions { PGOOptions(std::string ProfileGenFile = "", std::string ProfileUseFile = "", - std::string SampleProfileFile = "", bool RunProfileGen = false, - bool SamplePGOSupport = false) + std::string SampleProfileFile = "", + std::string ProfileRemappingFile = "", + bool RunProfileGen = false, bool SamplePGOSupport = false) : ProfileGenFile(ProfileGenFile), ProfileUseFile(ProfileUseFile), - SampleProfileFile(SampleProfileFile), RunProfileGen(RunProfileGen), + SampleProfileFile(SampleProfileFile), + ProfileRemappingFile(ProfileRemappingFile), + RunProfileGen(RunProfileGen), SamplePGOSupport(SamplePGOSupport || !SampleProfileFile.empty()) { assert((RunProfileGen || !SampleProfileFile.empty() || @@ -45,6 +48,7 @@ struct PGOOptions { std::string ProfileGenFile; std::string ProfileUseFile; std::string SampleProfileFile; + std::string ProfileRemappingFile; bool RunProfileGen; bool SamplePGOSupport; }; @@ -587,7 +591,8 @@ private: void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, OptimizationLevel Level, bool RunProfileGen, std::string ProfileGenFile, - std::string ProfileUseFile); + std::string ProfileUseFile, + std::string ProfileRemappingFile); void invokePeepholeEPCallbacks(FunctionPassManager &, OptimizationLevel); diff --git a/include/llvm/Transforms/IPO/SampleProfile.h b/include/llvm/Transforms/IPO/SampleProfile.h index cd5a0563898..af4a933ec1f 100644 --- a/include/llvm/Transforms/IPO/SampleProfile.h +++ b/include/llvm/Transforms/IPO/SampleProfile.h @@ -25,13 +25,16 @@ class Module; /// The sample profiler data loader pass. 
class SampleProfileLoaderPass : public PassInfoMixin { public: - SampleProfileLoaderPass(std::string File = "", bool IsThinLTOPreLink = false) - : ProfileFileName(File), IsThinLTOPreLink(IsThinLTOPreLink) {} + SampleProfileLoaderPass(std::string File = "", std::string RemappingFile = "", + bool IsThinLTOPreLink = false) + : ProfileFileName(File), ProfileRemappingFileName(RemappingFile), + IsThinLTOPreLink(IsThinLTOPreLink) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); private: std::string ProfileFileName; + std::string ProfileRemappingFileName; bool IsThinLTOPreLink; }; diff --git a/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h index c0b37c470b7..fdc5df68a66 100644 --- a/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h +++ b/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h @@ -36,12 +36,14 @@ public: /// The profile annotation (profile-instr-use) pass for IR based PGO. class PGOInstrumentationUse : public PassInfoMixin { public: - PGOInstrumentationUse(std::string Filename = ""); + PGOInstrumentationUse(std::string Filename = "", + std::string RemappingFilename = ""); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); private: std::string ProfileFileName; + std::string ProfileRemappingFileName; }; /// The indirect function call promotion pass. 
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index 50d0075a608..6942cb28af2 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -263,8 +263,15 @@ static void computeCacheKey( if (!Conf.SampleProfile.empty()) { auto FileOrErr = MemoryBuffer::getFile(Conf.SampleProfile); - if (FileOrErr) + if (FileOrErr) { Hasher.update(FileOrErr.get()->getBuffer()); + + if (!Conf.ProfileRemapping.empty()) { + FileOrErr = MemoryBuffer::getFile(Conf.ProfileRemapping); + if (FileOrErr) + Hasher.update(FileOrErr.get()->getBuffer()); + } + } } Key = toHex(Hasher.result()); diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index be33ab84933..20fc40de4b9 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -155,7 +155,8 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM, const ModuleSummaryIndex *ImportSummary) { Optional PGOOpt; if (!Conf.SampleProfile.empty()) - PGOOpt = PGOOptions("", "", Conf.SampleProfile, false, true); + PGOOpt = PGOOptions("", "", Conf.SampleProfile, Conf.ProfileRemapping, + false, true); PassBuilder PB(TM, PGOOpt); AAManager AA; diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index a880befc0d5..94afb5409e1 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -505,7 +505,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, PassBuilder::OptimizationLevel Level, bool RunProfileGen, std::string ProfileGenFile, - std::string ProfileUseFile) { + std::string ProfileUseFile, + std::string ProfileRemappingFile) { // Generally running simplification passes and the inliner with an high // threshold results in smaller executables, but there may be cases where // the size grows, so let's be conservative here and skip this simplification @@ -559,7 +560,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, } if (!ProfileUseFile.empty()) - MPM.addPass(PGOInstrumentationUse(ProfileUseFile)); + 
MPM.addPass(PGOInstrumentationUse(ProfileUseFile, ProfileRemappingFile)); } static InlineParams @@ -605,6 +606,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Annotate sample profile right after early FPM to ensure freshness of // the debug info. MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile, + PGOOpt->ProfileRemappingFile, Phase == ThinLTOPhase::PreLink)); // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard // for the profile annotation to be accurate in the ThinLTO backend. @@ -657,7 +659,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, if (PGOOpt && Phase != ThinLTOPhase::PostLink && (!PGOOpt->ProfileGenFile.empty() || !PGOOpt->ProfileUseFile.empty())) { addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen, - PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile); + PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile, + PGOOpt->ProfileRemappingFile); MPM.addPass(PGOIndirectCallPromotion(false, false)); } diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 182202fda05..4a69a0c2806 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -96,6 +96,13 @@ static cl::opt SampleProfileFile( "sample-profile-file", cl::init(""), cl::value_desc("filename"), cl::desc("Profile file loaded by -sample-profile"), cl::Hidden); +// The named file contains a set of transformations that may have been applied +// to the symbol names between the program from which the sample data was +// collected and the current program's symbols. 
+static cl::opt SampleProfileRemappingFile( + "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden); + static cl::opt SampleProfileMaxPropagateIterations( "sample-profile-max-propagate-iterations", cl::init(100), cl::desc("Maximum number of iterations to go through when propagating " @@ -183,12 +190,12 @@ private: class SampleProfileLoader { public: SampleProfileLoader( - StringRef Name, bool IsThinLTOPreLink, + StringRef Name, StringRef RemapName, bool IsThinLTOPreLink, std::function GetAssumptionCache, std::function GetTargetTransformInfo) : GetAC(std::move(GetAssumptionCache)), GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), - IsThinLTOPreLink(IsThinLTOPreLink) {} + RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -282,6 +289,9 @@ protected: /// Name of the profile file to load. std::string Filename; + /// Name of the profile remapping file to load. + std::string RemappingFilename; + /// Flag indicating whether the profile input loaded successfully. 
bool ProfileIsValid = false; @@ -311,13 +321,14 @@ public: SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile, bool IsThinLTOPreLink = false) - : ModulePass(ID), SampleLoader(Name, IsThinLTOPreLink, - [&](Function &F) -> AssumptionCache & { - return ACT->getAssumptionCache(F); - }, - [&](Function &F) -> TargetTransformInfo & { - return TTIWP->getTTI(F); - }) { + : ModulePass(ID), + SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink, + [&](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }, + [&](Function &F) -> TargetTransformInfo & { + return TTIWP->getTTI(F); + }) { initializeSampleProfileLoaderLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -1515,11 +1526,26 @@ bool SampleProfileLoader::doInitialization(Module &M) { Reader = std::move(ReaderOrErr.get()); Reader->collectFuncsToUse(M); ProfileIsValid = (Reader->read() == sampleprof_error::success); + + if (!RemappingFilename.empty()) { + // Apply profile remappings to the loaded profile data if requested. + // For now, we only support remapping symbols encoded using the Itanium + // C++ ABI's name mangling scheme. + ReaderOrErr = SampleProfileReaderItaniumRemapper::create( + RemappingFilename, Ctx, std::move(Reader)); + if (std::error_code EC = ReaderOrErr.getError()) { + std::string Msg = "Could not open profile remapping file: " + EC.message(); + Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); + return false; + } + Reader = std::move(ReaderOrErr.get()); + ProfileIsValid = (Reader->read() == sampleprof_error::success); + } return true; } ModulePass *llvm::createSampleProfileLoaderPass() { - return new SampleProfileLoaderLegacyPass(SampleProfileFile); + return new SampleProfileLoaderLegacyPass(); } ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { @@ -1612,6 +1638,8 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M, SampleProfileLoader SampleLoader( ProfileFileName.empty() ? 
SampleProfileFile : ProfileFileName, + ProfileRemappingFileName.empty() ? SampleProfileRemappingFile + : ProfileRemappingFileName, IsThinLTOPreLink, GetAssumptionCache, GetTTI); SampleLoader.doInitialization(M); diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 307b7eaa219..ac851f660d9 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -141,6 +141,11 @@ static cl::opt cl::value_desc("filename"), cl::desc("Specify the path of profile data file. This is" "mainly for test purpose.")); +static cl::opt PGOTestProfileRemappingFile( + "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Specify the path of profile remapping file. This is mainly for " + "test purpose.")); // Command line option to disable value profiling. The default is false: // i.e. value profiling is enabled by default. This is for debug purpose. @@ -1429,13 +1434,14 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M, } static bool annotateAllFunctions( - Module &M, StringRef ProfileFileName, + Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName, function_ref LookupBPI, function_ref LookupBFI) { LLVM_DEBUG(dbgs() << "Read in profile counters: "); auto &Ctx = M.getContext(); // Read the counter array from file. 
- auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); + auto ReaderOrErr = + IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName); if (Error E = ReaderOrErr.takeError()) { handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) { Ctx.diagnose( @@ -1529,10 +1535,14 @@ static bool annotateAllFunctions( return true; } -PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename) - : ProfileFileName(std::move(Filename)) { +PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename, + std::string RemappingFilename) + : ProfileFileName(std::move(Filename)), + ProfileRemappingFileName(std::move(RemappingFilename)) { if (!PGOTestProfileFile.empty()) ProfileFileName = PGOTestProfileFile; + if (!PGOTestProfileRemappingFile.empty()) + ProfileRemappingFileName = PGOTestProfileRemappingFile; } PreservedAnalyses PGOInstrumentationUse::run(Module &M, @@ -1547,7 +1557,8 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, return &FAM.getResult(F); }; - if (!annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI)) + if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, + LookupBPI, LookupBFI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1564,7 +1575,7 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { return &this->getAnalysis(F).getBFI(); }; - return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI); + return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI); } static std::string getSimpleNodeName(const BasicBlock *Node) { diff --git a/test/Transforms/PGOProfile/Inputs/remap.map b/test/Transforms/PGOProfile/Inputs/remap.map new file mode 100644 index 00000000000..df3d82d38bd --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/remap.map @@ -0,0 +1,8 @@ +# foo:: and foo::detail:: are equivalent +name 3foo N3foo6detailE + +# foo::qux and foo::quux are equivalent +type N3foo3quxE N3foo4quuxE + +# N::X and M::X are equivalent 
+name N1N1XE N1M1XE diff --git a/test/Transforms/PGOProfile/Inputs/remap.proftext b/test/Transforms/PGOProfile/Inputs/remap.proftext new file mode 100644 index 00000000000..40054d78f5a --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/remap.proftext @@ -0,0 +1,8 @@ +# :ir is the flag to indicate this is IR level profile. +:ir +_ZN3foo3barERKN1N1XINS_4quuxEEE +25571299074 +2 +3 +2 + diff --git a/test/Transforms/PGOProfile/remap.ll b/test/Transforms/PGOProfile/remap.ll new file mode 100644 index 00000000000..2fdca9e33d1 --- /dev/null +++ b/test/Transforms/PGOProfile/remap.ll @@ -0,0 +1,28 @@ +; RUN: llvm-profdata merge %S/Inputs/remap.proftext -o %t.profdata +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pgo-test-profile-remapping-file=%S/Inputs/remap.map -S | FileCheck %s --check-prefix=USE + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @_ZN3foo3barERKN1M1XINS_6detail3quxEEE(i32 %i) { +; USE-LABEL: @_ZN3foo3barERKN1M1XINS_6detail3quxEEE +; USE-SAME: !prof ![[FUNC_ENTRY_COUNT:[0-9]+]] +entry: + %cmp = icmp sgt i32 %i, 0 + br i1 %cmp, label %if.then, label %if.end +; USE: br i1 %cmp, label %if.then, label %if.end +; USE-SAME: !prof ![[BW_ENTRY:[0-9]+]] + +if.then: + %add = add nsw i32 %i, 2 + br label %if.end + +if.end: + %retv = phi i32 [ %add, %if.then ], [ %i, %entry ] + ret i32 %retv +} + +; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}} +; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}} +; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 3} +; USE-DAG: ![[BW_ENTRY]] = !{!"branch_weights", i32 2, i32 1} diff --git a/test/Transforms/SampleProfile/Inputs/remap.map b/test/Transforms/SampleProfile/Inputs/remap.map new file mode 100644 index 00000000000..df3d82d38bd --- /dev/null +++ b/test/Transforms/SampleProfile/Inputs/remap.map @@ -0,0 +1,8 @@ +# foo:: and foo::detail:: are equivalent +name 3foo N3foo6detailE + +# 
foo::qux and foo::quux are equivalent +type N3foo3quxE N3foo4quuxE + +# N::X and M::X are equivalent +name N1N1XE N1M1XE diff --git a/test/Transforms/SampleProfile/Inputs/remap.prof b/test/Transforms/SampleProfile/Inputs/remap.prof new file mode 100644 index 00000000000..8244a51a165 --- /dev/null +++ b/test/Transforms/SampleProfile/Inputs/remap.prof @@ -0,0 +1,10 @@ +_ZN3foo3barERKN1N1XINS_4quuxEEE:15680:2500 + 1: 2500 + 4: 1000 + 5: 1000 + 6: 800 + 7: 500 + 9: 10226 + 10: 2243 + 16: 0 + 18: 0 diff --git a/test/Transforms/SampleProfile/remap.ll b/test/Transforms/SampleProfile/remap.ll new file mode 100644 index 00000000000..206962a3bef --- /dev/null +++ b/test/Transforms/SampleProfile/remap.ll @@ -0,0 +1,60 @@ +; RUN: opt %s -passes=sample-profile -sample-profile-file=%S/Inputs/remap.prof -sample-profile-remapping-file=%S/Inputs/remap.map | opt -analyze -branch-prob | FileCheck %s + +; Reduced from branch.ll + +declare i1 @foo() + +define void @_ZN3foo3barERKN1M1XINS_6detail3quxEEE() !dbg !2 { +; CHECK: Printing analysis 'Branch Probability Analysis' for function '_ZN3foo3barERKN1M1XINS_6detail3quxEEE': + +entry: + %cmp = call i1 @foo(), !dbg !6 + br i1 %cmp, label %if.then, label %if.end +; CHECK: edge entry -> if.then probability is 0x4ccf6b16 / 0x80000000 = 60.01% +; CHECK: edge entry -> if.end probability is 0x333094ea / 0x80000000 = 39.99% + +if.then: + br label %return + +if.end: + %cmp1 = call i1 @foo(), !dbg !7 + br i1 %cmp1, label %if.then.2, label %if.else +; CHECK: edge if.end -> if.then.2 probability is 0x6652c748 / 0x80000000 = 79.94% +; CHECK: edge if.end -> if.else probability is 0x19ad38b8 / 0x80000000 = 20.06% + +if.then.2: + call i1 @foo(), !dbg !8 + br label %for.cond + +for.cond: + %cmp5 = call i1 @foo() + br i1 %cmp5, label %for.body, label %for.end, !prof !9 +; CHECK: edge for.cond -> for.body probability is 0x73333333 / 0x80000000 = 90.00% +; CHECK: edge for.cond -> for.end probability is 0x0ccccccd / 0x80000000 = 10.00% + +for.body: + br 
label %for.cond + +for.end: + br label %return + +if.else: + br label %return + +return: + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!4, !5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "foo++", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !{}, retainedTypes: !{}) +!1 = !DIFile(filename: "test.cc", directory: "/foo/bar") +!2 = distinct !DISubprogram(name: "_ZN3foo3barERKN1M1XINS_6detail3quxEEE", scope: !1, file: !1, line: 4, type: !3, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !{}) +!3 = !DISubroutineType(types: !{}) +!4 = !{i32 2, !"Dwarf Version", i32 4} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !DILocation(line: 5, column: 8, scope: !2) +!7 = !DILocation(line: 8, column: 6, scope: !2) +!8 = !DILocation(line: 10, column: 11, scope: !2) +!9 = !{!"branch_weights", i32 90, i32 10} diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp index 55ca23cd6f3..e63547a79d0 100644 --- a/tools/opt/NewPMDriver.cpp +++ b/tools/opt/NewPMDriver.cpp @@ -108,6 +108,10 @@ static cl::opt PGOKindFlag( "Use sampled profile to guide PGO."))); static cl::opt ProfileFile( "profile-file", cl::desc("Path to the profile."), cl::Hidden); +static cl::opt + ProfileRemappingFile("profile-remapping-file", + cl::desc("Path to the profile remapping file."), + cl::Hidden); static cl::opt DebugInfoForProfiling( "new-pm-debug-info-for-profiling", cl::init(false), cl::Hidden, cl::desc("Emit special debug info to enable PGO profile generation.")); @@ -200,17 +204,17 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, Optional P; switch (PGOKindFlag) { case InstrGen: - P = PGOOptions(ProfileFile, "", "", true); + P = PGOOptions(ProfileFile, "", "", "", true); break; case InstrUse: - P = PGOOptions("", ProfileFile, "", false); + P = PGOOptions("", ProfileFile, "", ProfileRemappingFile, false); break; case 
SampleUse: - P = PGOOptions("", "", ProfileFile, false); + P = PGOOptions("", "", ProfileFile, ProfileRemappingFile, false); break; case NoPGO: if (DebugInfoForProfiling) - P = PGOOptions("", "", "", false, true); + P = PGOOptions("", "", "", "", false, true); else P = None; } -- GitLab From 9528e40193fc42c0bb185ae2442a063b035e752c Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool Date: Wed, 10 Oct 2018 23:53:12 +0000 Subject: [PATCH 0034/1116] llvm-c: Add C APIs to access DebugLoc info Add thin shims to C interface to provide access to DebugLoc info for Instructions, GlobalVariables and Functions. Patch by Josh Berdine! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344202 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm-c/Core.h | 38 ++++++++++++++++++++++ lib/IR/Core.cpp | 73 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h index f7f22387b53..2e8c29c23bf 100644 --- a/include/llvm-c/Core.h +++ b/include/llvm-c/Core.h @@ -929,6 +929,44 @@ void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char *Name, void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name, LLVMValueRef Val); +/** + * Return the directory of the debug location for this value, which must be + * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function. + * + * @see llvm::Instruction::getDebugLoc() + * @see llvm::GlobalVariable::getDebugInfo() + * @see llvm::Function::getSubprogram() + */ +const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length); + +/** + * Return the filename of the debug location for this value, which must be + * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function. 
+ * + * @see llvm::Instruction::getDebugLoc() + * @see llvm::GlobalVariable::getDebugInfo() + * @see llvm::Function::getSubprogram() + */ +const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length); + +/** + * Return the line number of the debug location for this value, which must be + * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function. + * + * @see llvm::Instruction::getDebugLoc() + * @see llvm::GlobalVariable::getDebugInfo() + * @see llvm::Function::getSubprogram() + */ +unsigned LLVMGetDebugLocLine(LLVMValueRef Val); + +/** + * Return the column number of the debug location for this value, which must be + * an llvm::Instruction. + * + * @see llvm::Instruction::getDebugLoc() + */ +unsigned LLVMGetDebugLocColumn(LLVMValueRef Val); + /** * Add a function to a module under a specified name. * diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index 410a426a4a2..639b6b4489a 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" @@ -1189,6 +1190,78 @@ void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name, N->addOperand(extractMDNode(unwrap(Val))); } +const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length) { + if (!Length) return nullptr; + StringRef S; + if (const auto *I = unwrap(Val)) { + S = I->getDebugLoc()->getDirectory(); + } else if (const auto *GV = unwrap(Val)) { + SmallVector GVEs; + GV->getDebugInfo(GVEs); + if (GVEs.size()) + if (const DIGlobalVariable *DGV = GVEs[0]->getVariable()) + S = DGV->getDirectory(); + } else if (const auto *F = unwrap(Val)) { + if (const DISubprogram *DSP = F->getSubprogram()) + S = DSP->getDirectory(); + } else { + assert(0 && "Expected Instruction, GlobalVariable or Function"); + return nullptr; + } + *Length = S.size(); + 
return S.data(); +} + +const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length) { + if (!Length) return nullptr; + StringRef S; + if (const auto *I = unwrap(Val)) { + S = I->getDebugLoc()->getFilename(); + } else if (const auto *GV = unwrap(Val)) { + SmallVector GVEs; + GV->getDebugInfo(GVEs); + if (GVEs.size()) + if (const DIGlobalVariable *DGV = GVEs[0]->getVariable()) + S = DGV->getFilename(); + } else if (const auto *F = unwrap(Val)) { + if (const DISubprogram *DSP = F->getSubprogram()) + S = DSP->getFilename(); + } else { + assert(0 && "Expected Instruction, GlobalVariable or Function"); + return nullptr; + } + *Length = S.size(); + return S.data(); +} + +unsigned LLVMGetDebugLocLine(LLVMValueRef Val) { + unsigned L = 0; + if (const auto *I = unwrap(Val)) { + L = I->getDebugLoc()->getLine(); + } else if (const auto *GV = unwrap(Val)) { + SmallVector GVEs; + GV->getDebugInfo(GVEs); + if (GVEs.size()) + if (const DIGlobalVariable *DGV = GVEs[0]->getVariable()) + L = DGV->getLine(); + } else if (const auto *F = unwrap(Val)) { + if (const DISubprogram *DSP = F->getSubprogram()) + L = DSP->getLine(); + } else { + assert(0 && "Expected Instruction, GlobalVariable or Function"); + return -1; + } + return L; +} + +unsigned LLVMGetDebugLocColumn(LLVMValueRef Val) { + unsigned C = 0; + if (const auto *I = unwrap(Val)) + if (const auto &L = I->getDebugLoc()) + C = L->getColumn(); + return C; +} + /*--.. Operations on scalar constants ......................................--*/ LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N, -- GitLab From ca6f7dc5ec23fe28db50bbbad42112a9c9acce17 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 11 Oct 2018 00:01:25 +0000 Subject: [PATCH 0035/1116] [WebAssembly] Saturating float to int intrinsics Summary: Although the saturating float to int instructions are already emitted from normal IR, the fpto{s,u}i instructions produce poison values if the argument cannot fit in the result type. 
These intrinsics are therefore necessary to get guaranteed defined saturating behavior. Reviewers: aheejin, dschuff Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D53004 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344204 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsWebAssembly.td | 11 +++ .../WebAssembly/WebAssemblyInstrConv.td | 18 ++++ .../WebAssembly/WebAssemblyInstrSIMD.td | 10 +++ test/CodeGen/WebAssembly/conv.ll | 88 +++++++++++++++++++ test/CodeGen/WebAssembly/simd-intrinsics.ll | 48 ++++++++++ 5 files changed, 175 insertions(+) diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td index 54408d317d2..adf7cb0ba0e 100644 --- a/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/include/llvm/IR/IntrinsicsWebAssembly.td @@ -36,6 +36,17 @@ def int_wasm_mem_grow : Intrinsic<[llvm_anyint_ty], def int_wasm_current_memory : Intrinsic<[llvm_anyint_ty], [], [IntrReadMem]>; def int_wasm_grow_memory : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], []>; +//===----------------------------------------------------------------------===// +// Saturating float-to-int conversions +//===----------------------------------------------------------------------===// + +def int_wasm_trunc_saturate_signed : Intrinsic<[llvm_anyint_ty], + [llvm_anyfloat_ty], + [IntrNoMem, IntrSpeculatable]>; +def int_wasm_trunc_saturate_unsigned : Intrinsic<[llvm_anyint_ty], + [llvm_anyfloat_ty], + [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Exception handling intrinsics //===----------------------------------------------------------------------===// diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index e9ba52799ee..0d772c743a7 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td 
@@ -97,6 +97,24 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins), "i64.trunc_u:sat/f64", 0xfc07>, Requires<[HasNontrappingFPToInt]>; +// Lower llvm.wasm.trunc.saturate.* to saturating instructions +def : Pat<(int_wasm_trunc_saturate_signed F32:$src), + (I32_TRUNC_S_SAT_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src), + (I32_TRUNC_U_SAT_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_saturate_signed F64:$src), + (I32_TRUNC_S_SAT_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src), + (I32_TRUNC_U_SAT_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_saturate_signed F32:$src), + (I64_TRUNC_S_SAT_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src), + (I64_TRUNC_U_SAT_F32 F32:$src)>; +def : Pat<(int_wasm_trunc_saturate_signed F64:$src), + (I64_TRUNC_S_SAT_F64 F64:$src)>; +def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src), + (I64_TRUNC_U_SAT_F64 F64:$src)>; + // Conversion from floating point to integer pseudo-instructions which don't // trap on overflow or invalid. 
let usesCustomInserter = 1, isCodeGenOnly = 1 in { diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 90bdc17890b..4fffd979cd6 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -782,6 +782,16 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; +// Lower llvm.wasm.trunc.saturate.* to saturating instructions +def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), + (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>; +def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))), + (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>; +def : Pat<(v2i64 (int_wasm_trunc_saturate_signed (v2f64 V128:$src))), + (fp_to_sint_v2i64_v2f64 (v2f64 V128:$src))>; +def : Pat<(v2i64 (int_wasm_trunc_saturate_unsigned (v2f64 V128:$src))), + (fp_to_uint_v2i64_v2f64 (v2f64 V128:$src))>; + // Bitcasts are nops // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types foreach t1 = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll index bd3ae29e28e..ea1ef9737c0 100644 --- a/test/CodeGen/WebAssembly/conv.ll +++ b/test/CodeGen/WebAssembly/conv.ll @@ -45,6 +45,17 @@ define i32 @i32_trunc_s_f32(float %x) { ret i32 %a } +; CHECK-LABEL: i32_trunc_sat_s_f32: +; CHECK-NEXT: .param f32{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float) +define i32 @i32_trunc_sat_s_f32(float %x) { + %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float %x) + ret i32 %a +} + ; CHECK-LABEL: i32_trunc_u_f32: ; CHECK-NEXT: .param f32{{$}} ; CHECK-NEXT: .result i32{{$}} @@ -55,6 +66,17 @@ define i32 @i32_trunc_u_f32(float %x) { ret i32 %a } +; CHECK-LABEL: i32_trunc_sat_u_f32: +; CHECK-NEXT: .param f32{{$}} +; 
CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float) +define i32 @i32_trunc_sat_u_f32(float %x) { + %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float %x) + ret i32 %a +} + ; CHECK-LABEL: i32_trunc_s_f64: ; CHECK-NEXT: .param f64{{$}} ; CHECK-NEXT: .result i32{{$}} @@ -65,6 +87,17 @@ define i32 @i32_trunc_s_f64(double %x) { ret i32 %a } +; CHECK-LABEL: i32_trunc_sat_s_f64: +; CHECK-NEXT: .param f64{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double) +define i32 @i32_trunc_sat_s_f64(double %x) { + %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double %x) + ret i32 %a +} + ; CHECK-LABEL: i32_trunc_u_f64: ; CHECK-NEXT: .param f64{{$}} ; CHECK-NEXT: .result i32{{$}} @@ -75,6 +108,17 @@ define i32 @i32_trunc_u_f64(double %x) { ret i32 %a } +; CHECK-LABEL: i32_trunc_sat_u_f64: +; CHECK-NEXT: .param f64{{$}} +; CHECK-NEXT: .result i32{{$}} +; CHECK-NEXT: i32.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double) +define i32 @i32_trunc_sat_u_f64(double %x) { + %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double %x) + ret i32 %a +} + ; CHECK-LABEL: i64_trunc_s_f32: ; CHECK-NEXT: .param f32{{$}} ; CHECK-NEXT: .result i64{{$}} @@ -85,6 +129,17 @@ define i64 @i64_trunc_s_f32(float %x) { ret i64 %a } +; CHECK-LABEL: i64_trunc_sat_s_f32: +; CHECK-NEXT: .param f32{{$}} +; CHECK-NEXT: .result i64{{$}} +; CHECK-NEXT: i64.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float) +define i64 @i64_trunc_sat_s_f32(float %x) { + %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float 
%x) + ret i64 %a +} + ; CHECK-LABEL: i64_trunc_u_f32: ; CHECK-NEXT: .param f32{{$}} ; CHECK-NEXT: .result i64{{$}} @@ -95,6 +150,17 @@ define i64 @i64_trunc_u_f32(float %x) { ret i64 %a } +; CHECK-LABEL: i64_trunc_sat_u_f32: +; CHECK-NEXT: .param f32{{$}} +; CHECK-NEXT: .result i64{{$}} +; CHECK-NEXT: i64.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float) +define i64 @i64_trunc_sat_u_f32(float %x) { + %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float %x) + ret i64 %a +} + ; CHECK-LABEL: i64_trunc_s_f64: ; CHECK-NEXT: .param f64{{$}} ; CHECK-NEXT: .result i64{{$}} @@ -105,6 +171,17 @@ define i64 @i64_trunc_s_f64(double %x) { ret i64 %a } +; CHECK-LABEL: i64_trunc_sat_s_f64: +; CHECK-NEXT: .param f64{{$}} +; CHECK-NEXT: .result i64{{$}} +; CHECK-NEXT: i64.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double) +define i64 @i64_trunc_sat_s_f64(double %x) { + %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double %x) + ret i64 %a +} + ; CHECK-LABEL: i64_trunc_u_f64: ; CHECK-NEXT: .param f64{{$}} ; CHECK-NEXT: .result i64{{$}} @@ -115,6 +192,17 @@ define i64 @i64_trunc_u_f64(double %x) { ret i64 %a } +; CHECK-LABEL: i64_trunc_sat_u_f64: +; CHECK-NEXT: .param f64{{$}} +; CHECK-NEXT: .result i64{{$}} +; CHECK-NEXT: i64.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[NUM]]{{$}} +declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double) +define i64 @i64_trunc_sat_u_f64(double %x) { + %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double %x) + ret i64 %a +} + ; CHECK-LABEL: f32_convert_s_i32: ; CHECK-NEXT: .param i32{{$}} ; CHECK-NEXT: .result f32{{$}} diff --git a/test/CodeGen/WebAssembly/simd-intrinsics.ll b/test/CodeGen/WebAssembly/simd-intrinsics.ll index f9f4eb0cf9e..ab32929ceb8 100644 --- 
a/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -226,6 +226,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) { ret <4 x i32> %a } +; CHECK-LABEL: trunc_sat_s_v4i32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: i32x4.trunc_sat_s/f32x4 $push[[R:[0-9]+]]=, $0 +; SIMD128-NEXT: return $pop[[R]] +declare <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float>) +define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) { + %a = call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %x) + ret <4 x i32> %a +} + +; CHECK-LABEL: trunc_sat_u_v4i32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: i32x4.trunc_sat_u/f32x4 $push[[R:[0-9]+]]=, $0 +; SIMD128-NEXT: return $pop[[R]] +declare <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float>) +define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) { + %a = call <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float> %x) + ret <4 x i32> %a +} + ; ============================================================================== ; 2 x i64 ; ============================================================================== @@ -264,6 +288,30 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) { ret <2 x i64> %a } +; CHECK-LABEL: trunc_sat_s_v2i64: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: i64x2.trunc_sat_s/f64x2 $push[[R:[0-9]+]]=, $0 +; SIMD128-NEXT: return $pop[[R]] +declare <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double>) +define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) { + %a = call <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double> %x) + ret <2 x i64> %a +} + +; CHECK-LABEL: trunc_sat_u_v2i64: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param 
v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: i64x2.trunc_sat_u/f64x2 $push[[R:[0-9]+]]=, $0 +; SIMD128-NEXT: return $pop[[R]] +declare <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double>) +define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) { + %a = call <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double> %x) + ret <2 x i64> %a +} + ; ============================================================================== ; 4 x f32 ; ============================================================================== -- GitLab From d36df14f65d85d69bce145f509bd0b28ee92dce4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 11 Oct 2018 00:08:59 +0000 Subject: [PATCH 0036/1116] [MC][ELF] Fix section_mergeable_size.ll Some targets use %progbits instead of @progbits. Updating that check with a {{[@%]}}progbits regex to make those bots happy. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344206 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/Generic/section_mergeable_size.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CodeGen/Generic/section_mergeable_size.ll b/test/CodeGen/Generic/section_mergeable_size.ll index fbab7fe849f..0a7ddd110c4 100644 --- a/test/CodeGen/Generic/section_mergeable_size.ll +++ b/test/CodeGen/Generic/section_mergeable_size.ll @@ -1,3 +1,3 @@ ; RUN: llc < %s | FileCheck %s @a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4 -; CHECK: .init.rodata,"aM",@progbits,4 +; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4 -- GitLab From 119d9f6b0f78730ac836eb96b931434a537dec42 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 11 Oct 2018 00:49:24 +0000 Subject: [PATCH 0037/1116] [WebAssembly][NFC] Use intrinsic dag nodes directly Summary: Instead of custom lowering to WebAssemblyISD nodes first. 
Reviewers: aheejin, dschuff Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D53119 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344211 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISD.def | 7 ---- .../WebAssembly/WebAssemblyISelLowering.cpp | 38 ------------------ .../WebAssembly/WebAssemblyInstrSIMD.td | 39 +++++++------------ 3 files changed, 14 insertions(+), 70 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def index f326d37944f..3c44d04598c 100644 --- a/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/lib/Target/WebAssembly/WebAssemblyISD.def @@ -22,12 +22,5 @@ HANDLE_NODETYPE(Wrapper) HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(ANYTRUE) -HANDLE_NODETYPE(ALLTRUE) -HANDLE_NODETYPE(BITSELECT) -HANDLE_NODETYPE(ADD_SAT_S) -HANDLE_NODETYPE(ADD_SAT_U) -HANDLE_NODETYPE(SUB_SAT_S) -HANDLE_NODETYPE(SUB_SAT_U) // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4ecbf6d7487..30c2e843408 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -966,44 +966,6 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, default: return {}; // Don't custom lower most intrinsics. 
- case Intrinsic::wasm_add_saturate_signed: - case Intrinsic::wasm_add_saturate_unsigned: - case Intrinsic::wasm_sub_saturate_signed: - case Intrinsic::wasm_sub_saturate_unsigned: { - unsigned OpCode; - switch (IntNo) { - case Intrinsic::wasm_add_saturate_signed: - OpCode = WebAssemblyISD::ADD_SAT_S; - break; - case Intrinsic::wasm_add_saturate_unsigned: - OpCode = WebAssemblyISD::ADD_SAT_U; - break; - case Intrinsic::wasm_sub_saturate_signed: - OpCode = WebAssemblyISD::SUB_SAT_S; - break; - case Intrinsic::wasm_sub_saturate_unsigned: - OpCode = WebAssemblyISD::SUB_SAT_U; - break; - default: - llvm_unreachable("unexpected intrinsic id"); - break; - } - return DAG.getNode(OpCode, DL, Op.getValueType(), Op.getOperand(1), - Op.getOperand(2)); - } - - case Intrinsic::wasm_bitselect: - return DAG.getNode(WebAssemblyISD::BITSELECT, DL, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::wasm_anytrue: - case Intrinsic::wasm_alltrue: { - unsigned OpCode = IntNo == Intrinsic::wasm_anytrue - ? 
WebAssemblyISD::ANYTRUE - : WebAssemblyISD::ALLTRUE; - return DAG.getNode(OpCode, DL, Op.getValueType(), Op.getOperand(1)); - } - case Intrinsic::wasm_lsda: // TODO For now, just return 0 not to crash return DAG.getConstant(0, DL, Op.getValueType()); diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 4fffd979cd6..419aa0b437f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -382,7 +382,9 @@ multiclass SIMDBinary simdop> { defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins), - [(set (vec_t V128:$dst), (node V128:$lhs, V128:$rhs))], + [(set (vec_t V128:$dst), + (node (vec_t V128:$lhs), (vec_t V128:$rhs)) + )], vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>; } @@ -434,23 +436,19 @@ multiclass SIMDBinarySat baseInst> { defm "" : SIMDBinary; } -def wasm_saturate_t : SDTypeProfile<1, 2, - [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>] ->; -def wasm_add_sat_s : SDNode<"WebAssemblyISD::ADD_SAT_S", wasm_saturate_t>; -def wasm_add_sat_u : SDNode<"WebAssemblyISD::ADD_SAT_U", wasm_saturate_t>; -def wasm_sub_sat_s : SDNode<"WebAssemblyISD::SUB_SAT_S", wasm_saturate_t>; -def wasm_sub_sat_u : SDNode<"WebAssemblyISD::SUB_SAT_U", wasm_saturate_t>; - // Saturating integer addition: add_saturate_s / add_saturate_u let isCommutable = 1 in { -defm ADD_SAT_S : SIMDBinarySat; -defm ADD_SAT_U : SIMDBinarySat; +defm ADD_SAT_S : + SIMDBinarySat; +defm ADD_SAT_U : + SIMDBinarySat; } // isCommutable = 1 // Saturating integer subtraction: sub_saturate_s / sub_saturate_u -defm SUB_SAT_S : SIMDBinarySat; -defm SUB_SAT_U : SIMDBinarySat; +defm SUB_SAT_S : + SIMDBinarySat; +defm SUB_SAT_U : + SIMDBinarySat; //===----------------------------------------------------------------------===// // Bit shifts @@ -518,16 +516,11 @@ defm "" : SIMDNot; defm "" : SIMDNot; // Bitwise select: v128.bitselect -def wasm_bitselect_t : 
SDTypeProfile<1, 3, - [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>] ->; -def wasm_bitselect : SDNode<"WebAssemblyISD::BITSELECT", wasm_bitselect_t>; - multiclass Bitselect { defm BITSELECT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), [(set (vec_t V128:$dst), - (vec_t (wasm_bitselect + (vec_t (int_wasm_bitselect (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2) )) )], @@ -562,15 +555,11 @@ multiclass SIMDReduce baseInst> { defm "" : SIMDReduceVec; } -def wasm_reduce_t : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>; - // Any lane true: any_true -def wasm_anytrue : SDNode<"WebAssemblyISD::ANYTRUE", wasm_reduce_t>; -defm ANYTRUE : SIMDReduce<"any_true", wasm_anytrue, 65>; +defm ANYTRUE : SIMDReduce<"any_true", int_wasm_anytrue, 65>; // All lanes true: all_true -def wasm_alltrue : SDNode<"WebAssemblyISD::ALLTRUE", wasm_reduce_t>; -defm ALLTRUE : SIMDReduce<"all_true", wasm_alltrue, 69>; +defm ALLTRUE : SIMDReduce<"all_true", int_wasm_alltrue, 69>; //===----------------------------------------------------------------------===// // Comparisons -- GitLab From 4599ef42e718015834c38475f2ed8865306d1f56 Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Thu, 11 Oct 2018 03:42:17 +0000 Subject: [PATCH 0038/1116] Use fully qualified namespace name. llvm::detail is not the only namespace named detail. So if someone has done a `using namespace llvm::support`, for example, this will fail with an ambiguous namespace name. Granted people generally shouldn't be using large namespaces like that, but it's common at local function scopes. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344216 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/DenseMap.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index 380f1db0d04..8fe0f48adf2 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -46,9 +46,10 @@ struct DenseMapPair : public std::pair { } // end namespace detail -template < - typename KeyT, typename ValueT, typename KeyInfoT = DenseMapInfo, - typename Bucket = detail::DenseMapPair, bool IsConst = false> +template , + typename Bucket = llvm::detail::DenseMapPair, + bool IsConst = false> class DenseMapIterator; template , - typename BucketT = detail::DenseMapPair> + typename BucketT = llvm::detail::DenseMapPair> class DenseMap : public DenseMapBase, KeyT, ValueT, KeyInfoT, BucketT> { friend class DenseMapBase; @@ -798,7 +799,7 @@ private: template , - typename BucketT = detail::DenseMapPair> + typename BucketT = llvm::detail::DenseMapPair> class SmallDenseMap : public DenseMapBase< SmallDenseMap, KeyT, -- GitLab From 0801e41a5193a5c6d9794886524bbeb83f7f4150 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 11 Oct 2018 04:00:51 +0000 Subject: [PATCH 0039/1116] [Coverage] Apply filtered paths to summary Summary: The script to generate code coverage reports supports passing filter paths to llvm-cov when generating the HTML reports, but doesn't pass those paths to the summary generation as well. This results in a summary report that doesn't match the HTML report. This patch addresses the problem by also passing the filter paths to the summary report generation. 
Reviewers: vsk Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53110 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344217 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/prepare-code-coverage-artifact.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/prepare-code-coverage-artifact.py b/utils/prepare-code-coverage-artifact.py index 883cdd78049..5c4af242d0d 100644 --- a/utils/prepare-code-coverage-artifact.py +++ b/utils/prepare-code-coverage-artifact.py @@ -51,7 +51,8 @@ def prepare_html_report(host_llvm_cov, profile, report_dir, binaries, subprocess.check_call(invocation) with open(os.path.join(report_dir, 'summary.txt'), 'wb') as Summary: subprocess.check_call([host_llvm_cov, 'report'] + objects + - ['-instr-profile', profile], stdout=Summary) + ['-instr-profile', profile] + restricted_dirs, + stdout=Summary) print('Done!') def prepare_html_reports(host_llvm_cov, profdata_path, report_dir, binaries, -- GitLab From 6fb010f388bb2cb2f00fe039123092308ac4865d Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 11 Oct 2018 04:02:53 +0000 Subject: [PATCH 0040/1116] [CMake] Unconditionally add .h and .td files to target sources Previously adding header and table gen files was conditional on using an IDE. Since these files have the `HEADER_FILE_ONLY` attribute applied they are ignored as sources by all non-IDE generators, so there is really no reason not to include them. Additionally having the CMake always include these files allows the CMake-server to include them in the sources list for targets, which is valuable to anyone using CMake-server integrated tools. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344218 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/LLVMProcessSources.cmake | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake index f65f31d797c..7cbd2863500 100644 --- a/cmake/modules/LLVMProcessSources.cmake +++ b/cmake/modules/LLVMProcessSources.cmake @@ -52,16 +52,15 @@ function(llvm_process_sources OUT_VAR) cmake_parse_arguments(ARG "" "" "ADDITIONAL_HEADERS;ADDITIONAL_HEADER_DIRS" ${ARGN}) set(sources ${ARG_UNPARSED_ARGUMENTS}) llvm_check_source_file_list( ${sources} ) - if( LLVM_ENABLE_IDE ) - # This adds .td and .h files to the Visual Studio solution: - add_td_sources(sources) - find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}") - if (hdrs) - set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON) - endif() - set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON) - list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs}) + + # This adds .td and .h files to the Visual Studio solution: + add_td_sources(sources) + find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}") + if (hdrs) + set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON) endif() + set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON) + list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs}) set( ${OUT_VAR} ${sources} PARENT_SCOPE ) endfunction(llvm_process_sources) -- GitLab From b1493403a4118390aeeb929e3d4ac7489dd0d167 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Thu, 11 Oct 2018 04:06:14 +0000 Subject: [PATCH 0041/1116] [CMake] Temporarily remove the LLVM_ENABLE_IDE option All uses of this option have been removed, and the intent is to change the purpose and default value of this option. To prevent it from having impacts on users, this patch temporarily removes the option and purges it from CMake caches. 
In a few days, once this has propagated to contributors I will re-introduce the option with the new default value. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344219 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/HandleLLVMOptions.cmake | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 0daaf7d95c0..85aebf6ed71 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -868,12 +868,16 @@ else() set(LLVM_ENABLE_PLUGINS ON) endif() -set(LLVM_ENABLE_IDE_default OFF) -if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR) - set(LLVM_ENABLE_IDE_default ON) -endif() -option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE" - ${LLVM_ENABLE_IDE_default}) +# Remove LLVM_ENABLE_IDE from the CMake cache. This is a temporary change to +# allow CMake caches to be cleaned up so that we can change the default for this +# option and how it is used. +unset(LLVM_ENABLE_IDE CACHE) +#set(LLVM_ENABLE_IDE_default OFF) +#if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR) +# set(LLVM_ENABLE_IDE_default ON) +#endif() +#option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE" +# ${LLVM_ENABLE_IDE_default}) function(get_compile_definitions) get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) -- GitLab From d342bc787ac2c06b23bf013303587e7f086fdc18 Mon Sep 17 00:00:00 2001 From: Martin Storsjo Date: Thu, 11 Oct 2018 06:53:38 +0000 Subject: [PATCH 0042/1116] [llvm-nm] Include the text "@FILE" in the output of --help libtool requires this text to be present, in order to conclude that the tool supports response files. Also add an explicit test of using response files with llvm-nm. 
Differential Revision: https://reviews.llvm.org/D53064 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344222 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-nm/X86/response-file.test | 5 +++++ test/tools/llvm-nm/libtool-response-file.test | 4 ++++ tools/llvm-nm/llvm-nm.cpp | 2 ++ 3 files changed, 11 insertions(+) create mode 100644 test/tools/llvm-nm/X86/response-file.test create mode 100644 test/tools/llvm-nm/libtool-response-file.test diff --git a/test/tools/llvm-nm/X86/response-file.test b/test/tools/llvm-nm/X86/response-file.test new file mode 100644 index 00000000000..5c53960056c --- /dev/null +++ b/test/tools/llvm-nm/X86/response-file.test @@ -0,0 +1,5 @@ +# RUN: echo "-P %p/Inputs/hello.obj.elf-x86_64" > %t-response +# RUN: llvm-nm @%t-response | FileCheck %s + +CHECK: main T 0 0 +CHECK: puts U 0 0 diff --git a/test/tools/llvm-nm/libtool-response-file.test b/test/tools/llvm-nm/libtool-response-file.test new file mode 100644 index 00000000000..5d4af74e316 --- /dev/null +++ b/test/tools/llvm-nm/libtool-response-file.test @@ -0,0 +1,4 @@ +RUN: llvm-nm --help | FileCheck %s +Check that the output of llvm-nm --help contains the literal text @FILE; this +indicates to libtool that llvm-nm does support response files. 
+CHECK: @FILE diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 7e1fd86d0b0..22fdd4ca85e 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -183,6 +183,8 @@ cl::opt<bool> DyldInfoOnly("dyldinfo-only", cl::opt<bool> NoLLVMBitcode("no-llvm-bc", cl::desc("Disable LLVM bitcode reader")); +cl::extrahelp HelpResponse("\nPass @FILE as argument to read options from FILE.\n"); + bool PrintAddress = true; bool MultipleFiles = false; -- GitLab From d7e48738baf8d9bfe70eea8383e0d230b1626ca6 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 11 Oct 2018 07:22:26 +0000 Subject: [PATCH 0043/1116] [IndVars] Drop "exact" flag from lshr and udiv when substituting their args There is a transform that may replace `lshr (x+1), 1` with `lshr x, 1` if it can prove that the result will be the same. However the initial instruction might have an `exact` flag set, and it now should be dropped unless we prove that it may hold. Incorrectly set `exact` attribute may then produce poison.
Differential Revision: https://reviews.llvm.org/D53061 Reviewed By: sanjoy git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344223 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/SimplifyIndVar.cpp | 9 ++ test/Transforms/IndVarSimplify/drop-exact.ll | 99 ++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 test/Transforms/IndVarSimplify/drop-exact.ll diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 51fda1c620b..7faf291e73d 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -108,6 +108,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) Value *IVSrc = nullptr; const unsigned OperIdx = 0; const SCEV *FoldedExpr = nullptr; + bool MustDropExactFlag = false; switch (UseInst->getOpcode()) { default: return nullptr; @@ -140,6 +141,11 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) APInt::getOneBitSet(BitWidth, D->getZExtValue())); } FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D)); + // We might have 'exact' flag set at this point which will no longer be + // correct after we make the replacement. + if (UseInst->isExact() && + SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D))) + MustDropExactFlag = true; } // We have something that might fold it's operand. Compare SCEVs. 
if (!SE->isSCEVable(UseInst->getType())) @@ -155,6 +161,9 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) UseInst->setOperand(OperIdx, IVSrc); assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper"); + if (MustDropExactFlag) + UseInst->dropPoisonGeneratingFlags(); + ++NumElimOperand; Changed = true; if (IVOperand->use_empty()) diff --git a/test/Transforms/IndVarSimplify/drop-exact.ll b/test/Transforms/IndVarSimplify/drop-exact.ll new file mode 100644 index 00000000000..ab5b2b5a859 --- /dev/null +++ b/test/Transforms/IndVarSimplify/drop-exact.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -indvars -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" + +; We make a transform by getting rid of add nsw i32 %tmp17, -1; make sure that +; we drop "exact" flag on lshr as we do it. +define void @drop_exact(i32* %p, i64* %p1) { +; CHECK-LABEL: @drop_exact( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB12:%.*]] +; CHECK: bb7: +; CHECK-NEXT: ret void +; CHECK: bb12: +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP15]] = add nsw i32 [[TMP13]], -1 +; CHECK-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP17]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = urem i32 [[TMP19]], 250 +; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP17]], 1 +; CHECK-NEXT: store i32 [[TMP22]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP20]] to i64 +; CHECK-NEXT: store i64 [[TMP26]], i64* [[P1:%.*]], align 4 +; CHECK-NEXT: [[TMP42]] = add nuw nsw i32 [[TMP14]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]] +; +bb: + br 
label %bb12 + +bb7: ; preds = %bb12 + ret void + +bb12: ; preds = %bb12, %bb + %tmp13 = phi i32 [ -47436, %bb ], [ %tmp15, %bb12 ] + %tmp14 = phi i32 [ 0, %bb ], [ %tmp42, %bb12 ] + %tmp15 = add i32 %tmp13, -1 + %tmp16 = shl i32 %tmp15, 1 + %tmp17 = sub i32 42831, %tmp16 + %tmp19 = lshr i32 %tmp17, 1 + %tmp20 = urem i32 %tmp19, 250 + %tmp21 = add nsw i32 %tmp17, -1 + %tmp22 = lshr exact i32 %tmp21, 1 + store i32 %tmp22, i32* %p, align 4 + %tmp26 = zext i32 %tmp20 to i64 + store i64 %tmp26, i64* %p1, align 4 + %tmp42 = add nuw nsw i32 %tmp14, 1 + %tmp43 = icmp ugt i32 %tmp14, 717 + br i1 %tmp43, label %bb7, label %bb12 +} + +; Throw away add nsw i32 %tmp17, 0, do not drop exact flag. +define void @dont_drop_exact(i32* %p, i64* %p1) { +; CHECK-LABEL: @dont_drop_exact( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB12:%.*]] +; CHECK: bb7: +; CHECK-NEXT: ret void +; CHECK: bb12: +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP15]] = add nsw i32 [[TMP13]], -1 +; CHECK-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP17]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = urem i32 [[TMP19]], 250 +; CHECK-NEXT: [[TMP22:%.*]] = lshr exact i32 [[TMP17]], 1 +; CHECK-NEXT: store i32 [[TMP22]], i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP20]] to i64 +; CHECK-NEXT: store i64 [[TMP26]], i64* [[P1:%.*]], align 4 +; CHECK-NEXT: [[TMP42]] = add nuw nsw i32 [[TMP14]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]] +; +bb: + br label %bb12 + +bb7: ; preds = %bb12 + ret void + +bb12: ; preds = %bb12, %bb + %tmp13 = phi i32 [ -47436, %bb ], [ %tmp15, %bb12 ] + %tmp14 = phi i32 [ 0, %bb ], [ %tmp42, %bb12 ] + %tmp15 = add i32 %tmp13, -1 + %tmp16 = shl i32 
%tmp15, 1 + %tmp17 = sub i32 42831, %tmp16 + %tmp19 = lshr i32 %tmp17, 1 + %tmp20 = urem i32 %tmp19, 250 + %tmp21 = add nsw i32 %tmp17, 0 + %tmp22 = lshr exact i32 %tmp21, 1 + store i32 %tmp22, i32* %p, align 4 + %tmp26 = zext i32 %tmp20 to i64 + store i64 %tmp26, i64* %p1, align 4 + %tmp42 = add nuw nsw i32 %tmp14, 1 + %tmp43 = icmp ugt i32 %tmp14, 717 + br i1 %tmp43, label %bb7, label %bb12 +} -- GitLab From b4d0c491d053d679b29dd78bc1ac00d4df00fbd3 Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Thu, 11 Oct 2018 07:51:13 +0000 Subject: [PATCH 0044/1116] [X86][BMI1]: X86DAGToDAGISel: select BEXTR from x & ~(-1 << nbits) pattern Summary: As discussed in D48491, we can't really do this in the TableGen, since we need to produce *two* instructions. This only implements one single pattern. The other 3 patterns will be in follow-ups. I'm not sure yet if we want to also fuse shift into here (i.e `(x >> start) & ...`) Reviewers: RKSimon, craig.topper, spatel Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D52304 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344224 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 83 +++++++++++++++++ test/CodeGen/X86/extract-bits.ll | 139 +++++++++------------------- test/CodeGen/X86/extract-lowbits.ll | 103 +++++++-------------- 3 files changed, 164 insertions(+), 161 deletions(-) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 5eb4dbb1d98..c043c7c54cc 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -451,6 +451,7 @@ namespace { } bool foldLoadStoreIntoMemOperand(SDNode *Node); + bool matchBEXTR(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); @@ -2565,6 +2566,86 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { return true; } +// See if this is an X & Mask that we can match to BEXTR. 
+// Where Mask is one of the following patterns: +// a) x & (1 << nbits) - 1 +// b) x & ~(-1 << nbits) +// c) x & (-1 >> (32 - y)) +// d) x << (32 - y) >> (32 - y) +bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) { + // BEXTR is BMI instruction. However, if we have BMI2, we prefer BZHI. + if (!Subtarget->hasBMI() || Subtarget->hasBMI2()) + return false; + + MVT NVT = Node->getSimpleValueType(0); + + // Only supported for 32 and 64 bits. + if (NVT != MVT::i32 && NVT != MVT::i64) + return false; + + SDValue NBits; + + // b) x & ~(-1 << nbits) + auto matchPatternB = [&NBits](SDValue Mask) -> bool { + // Match `~()`. Must only have one use! + if (!isBitwiseNot(Mask) || !Mask->hasOneUse()) + return false; + // Match `-1 << nbits`. Must only have one use! + SDValue M0 = Mask->getOperand(0); + if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse()) + return false; + if (!isAllOnesConstant(M0->getOperand(0))) + return false; + NBits = M0->getOperand(1); + return true; + }; + + auto matchLowBitMask = [&matchPatternB](SDValue Mask) -> bool { + // FIXME: patterns a, c, d. + return matchPatternB(Mask); + }; + + SDValue X = Node->getOperand(0); + SDValue Mask = Node->getOperand(1); + + if (matchLowBitMask(Mask)) { + // Great. + } else { + std::swap(X, Mask); + if (!matchLowBitMask(Mask)) + return false; + } + + SDLoc DL(Node); + + // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit) register. + // All the other bits are undefined, we do not care about them. + SDValue ImplDef = + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0); + insertDAGNode(*CurDAG, NBits, ImplDef); + SDValue OrigNBits = NBits; + NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits); + insertDAGNode(*CurDAG, OrigNBits, NBits); + + // The 'control' of BEXTR has the pattern of: + // [15...8 bit][ 7...0 bit] location + // [ bit count][ shift] name + // I.e. 
0b000000011'00000001 means (x >> 0b1) & 0b11 + + // Shift NBits left by 8 bits, thus producing 'control'. + SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); + SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8); + insertDAGNode(*CurDAG, OrigNBits, Control); + // NOTE: could also try to extract start from (x >> start) + + // And finally, form the BEXTR itself. + SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control); + ReplaceNode(Node, Extract.getNode()); + SelectCode(Extract.getNode()); + + return true; +} + // Emit a PCMISTR(I/M) instruction. MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, @@ -2872,6 +2953,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case ISD::AND: + if (matchBEXTR(Node)) + return; if (AndImmShrink && shrinkAndImmediate(Node)) return; diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll index 98c9ab271cb..b16aeb3d350 100644 --- a/test/CodeGen/X86/extract-bits.ll +++ b/test/CodeGen/X86/extract-bits.ll @@ -1507,16 +1507,12 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; ; X86-BMI1NOTBM-LABEL: bextr32_b0: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: pushl %esi ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1NOTBM-NEXT: shrl %cl, %edx -; X86-BMI1NOTBM-NEXT: movl $-1, %esi -; X86-BMI1NOTBM-NEXT: movl %eax, %ecx -; X86-BMI1NOTBM-NEXT: shll %cl, %esi -; X86-BMI1NOTBM-NEXT: andnl %edx, %esi, %eax -; X86-BMI1NOTBM-NEXT: popl %esi +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, %edx, %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_b0: @@ -1544,10 +1540,8 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind { ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; 
X64-BMI1NOTBM-NEXT: shrl %cl, %edi -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %edx +; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr32_b0: @@ -1580,16 +1574,12 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext % ; ; X86-BMI1NOTBM-LABEL: bextr32_b1_indexzext: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: pushl %esi ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1NOTBM-NEXT: shrl %cl, %edx -; X86-BMI1NOTBM-NEXT: movl $-1, %esi -; X86-BMI1NOTBM-NEXT: movl %eax, %ecx -; X86-BMI1NOTBM-NEXT: shll %cl, %esi -; X86-BMI1NOTBM-NEXT: andnl %edx, %esi, %eax -; X86-BMI1NOTBM-NEXT: popl %esi +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, %edx, %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_b1_indexzext: @@ -1617,10 +1607,8 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext % ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %edi -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %edx +; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr32_b1_indexzext: @@ -1656,17 +1644,13 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; ; X86-BMI1NOTBM-LABEL: bextr32_b2_load: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: pushl %esi ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), 
%edx ; X86-BMI1NOTBM-NEXT: movl (%edx), %edx ; X86-BMI1NOTBM-NEXT: shrl %cl, %edx -; X86-BMI1NOTBM-NEXT: movl $-1, %esi -; X86-BMI1NOTBM-NEXT: movl %eax, %ecx -; X86-BMI1NOTBM-NEXT: shll %cl, %esi -; X86-BMI1NOTBM-NEXT: andnl %edx, %esi, %eax -; X86-BMI1NOTBM-NEXT: popl %esi +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, %edx, %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_b2_load: @@ -1697,10 +1681,8 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X64-BMI1NOTBM-NEXT: movl (%rdi), %eax ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %eax -; X64-BMI1NOTBM-NEXT: movl $-1, %esi -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %esi -; X64-BMI1NOTBM-NEXT: andnl %eax, %esi, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %edx +; X64-BMI1NOTBM-NEXT: bextrl %edx, %eax, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr32_b2_load: @@ -1735,17 +1717,13 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe ; ; X86-BMI1NOTBM-LABEL: bextr32_b3_load_indexzext: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: pushl %esi ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1NOTBM-NEXT: movl (%edx), %edx ; X86-BMI1NOTBM-NEXT: shrl %cl, %edx -; X86-BMI1NOTBM-NEXT: movl $-1, %esi -; X86-BMI1NOTBM-NEXT: movl %eax, %ecx -; X86-BMI1NOTBM-NEXT: shll %cl, %esi -; X86-BMI1NOTBM-NEXT: andnl %edx, %esi, %eax -; X86-BMI1NOTBM-NEXT: popl %esi +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, %edx, %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_b3_load_indexzext: @@ -1776,10 +1754,8 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe ; X64-BMI1NOTBM-NEXT: movl (%rdi), %eax ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; 
X64-BMI1NOTBM-NEXT: shrl %cl, %eax -; X64-BMI1NOTBM-NEXT: movl $-1, %esi -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %esi -; X64-BMI1NOTBM-NEXT: andnl %eax, %esi, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %edx +; X64-BMI1NOTBM-NEXT: bextrl %edx, %eax, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr32_b3_load_indexzext: @@ -1815,16 +1791,12 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; ; X86-BMI1NOTBM-LABEL: bextr32_b4_commutative: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: pushl %esi ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-BMI1NOTBM-NEXT: shrl %cl, %edx -; X86-BMI1NOTBM-NEXT: movl $-1, %esi -; X86-BMI1NOTBM-NEXT: movl %eax, %ecx -; X86-BMI1NOTBM-NEXT: shll %cl, %esi -; X86-BMI1NOTBM-NEXT: andnl %edx, %esi, %eax -; X86-BMI1NOTBM-NEXT: popl %esi +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, %edx, %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_b4_commutative: @@ -1852,10 +1824,8 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %edi -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %edx +; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr32_b4_commutative: @@ -1896,24 +1866,19 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; ; X86-BMI1NOTBM-LABEL: bextr32_b5_skipextrauses: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: pushl %edi ; X86-BMI1NOTBM-NEXT: pushl %esi -; X86-BMI1NOTBM-NEXT: pushl %eax -; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), 
%dl -; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-BMI1NOTBM-NEXT: movl %eax, %ecx -; X86-BMI1NOTBM-NEXT: shrl %cl, %esi -; X86-BMI1NOTBM-NEXT: movl $-1, %edi -; X86-BMI1NOTBM-NEXT: movl %edx, %ecx -; X86-BMI1NOTBM-NEXT: shll %cl, %edi -; X86-BMI1NOTBM-NEXT: andnl %esi, %edi, %esi -; X86-BMI1NOTBM-NEXT: movl %eax, (%esp) +; X86-BMI1NOTBM-NEXT: subl $8, %esp +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: shrl %cl, %edx +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, %edx, %esi +; X86-BMI1NOTBM-NEXT: movl %ecx, (%esp) ; X86-BMI1NOTBM-NEXT: calll use32 ; X86-BMI1NOTBM-NEXT: movl %esi, %eax -; X86-BMI1NOTBM-NEXT: addl $4, %esp +; X86-BMI1NOTBM-NEXT: addl $8, %esp ; X86-BMI1NOTBM-NEXT: popl %esi -; X86-BMI1NOTBM-NEXT: popl %edi ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bextr32_b5_skipextrauses: @@ -1952,10 +1917,8 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X64-BMI1NOTBM-NEXT: pushq %rbx ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx ; X64-BMI1NOTBM-NEXT: shrl %cl, %edi -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %ebx +; X64-BMI1NOTBM-NEXT: shll $8, %edx +; X64-BMI1NOTBM-NEXT: bextrl %edx, %edi, %ebx ; X64-BMI1NOTBM-NEXT: movl %esi, %edi ; X64-BMI1NOTBM-NEXT: callq use32 ; X64-BMI1NOTBM-NEXT: movl %ebx, %eax @@ -2106,10 +2069,8 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rax +; 
X64-BMI1NOTBM-NEXT: shlq $8, %rdx +; X64-BMI1NOTBM-NEXT: bextrq %rdx, %rdi, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr64_b0: @@ -2245,13 +2206,12 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext % ; ; X64-BMI1NOTBM-LABEL: bextr64_b1_indexzext: ; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rax +; X64-BMI1NOTBM-NEXT: shlq $8, %rdx +; X64-BMI1NOTBM-NEXT: bextrq %rdx, %rdi, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr64_b1_indexzext: @@ -2399,10 +2359,8 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind ; X64-BMI1NOTBM-NEXT: movq (%rdi), %rax ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rax -; X64-BMI1NOTBM-NEXT: movq $-1, %rsi -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rsi -; X64-BMI1NOTBM-NEXT: andnq %rax, %rsi, %rax +; X64-BMI1NOTBM-NEXT: shlq $8, %rdx +; X64-BMI1NOTBM-NEXT: bextrq %rdx, %rax, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr64_b2_load: @@ -2543,14 +2501,13 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe ; ; X64-BMI1NOTBM-LABEL: bextr64_b3_load_indexzext: ; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: # kill: def $edx killed $edx def $rdx ; X64-BMI1NOTBM-NEXT: movl %esi, %ecx ; X64-BMI1NOTBM-NEXT: movq (%rdi), %rax ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rax -; X64-BMI1NOTBM-NEXT: movq $-1, %rsi -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rsi -; X64-BMI1NOTBM-NEXT: andnq %rax, %rsi, %rax +; X64-BMI1NOTBM-NEXT: 
shlq $8, %rdx +; X64-BMI1NOTBM-NEXT: bextrq %rdx, %rax, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr64_b3_load_indexzext: @@ -2694,10 +2651,8 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx ; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rax +; X64-BMI1NOTBM-NEXT: shlq $8, %rdx +; X64-BMI1NOTBM-NEXT: bextrq %rdx, %rdi, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bextr64_b4_commutative: @@ -2876,12 +2831,10 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; X64-BMI1NOTBM-LABEL: bextr64_b5_skipextrauses: ; X64-BMI1NOTBM: # %bb.0: ; X64-BMI1NOTBM-NEXT: pushq %rbx -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx +; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx ; X64-BMI1NOTBM-NEXT: shrq %cl, %rdi -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: movl %edx, %ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rbx +; X64-BMI1NOTBM-NEXT: shlq $8, %rdx +; X64-BMI1NOTBM-NEXT: bextrq %rdx, %rdi, %rbx ; X64-BMI1NOTBM-NEXT: movq %rsi, %rdi ; X64-BMI1NOTBM-NEXT: callq use64 ; X64-BMI1NOTBM-NEXT: movq %rbx, %rax diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll index 4af130cd825..43df34000d4 100644 --- a/test/CodeGen/X86/extract-lowbits.ll +++ b/test/CodeGen/X86/extract-lowbits.ll @@ -794,10 +794,9 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind { ; ; X86-BMI1NOTBM-LABEL: bzhi32_b0: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1NOTBM-NEXT: movl $-1, %eax -; X86-BMI1NOTBM-NEXT: shll %cl, %eax -; X86-BMI1NOTBM-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-BMI1NOTBM-NEXT: shll $8, 
%eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bzhi32_b0: @@ -818,11 +817,8 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi32_b0: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %esi +; X64-BMI1NOTBM-NEXT: bextrl %esi, %edi, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi32_b0: @@ -847,10 +843,9 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind { ; ; X86-BMI1NOTBM-LABEL: bzhi32_b1_indexzext: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1NOTBM-NEXT: movl $-1, %eax -; X86-BMI1NOTBM-NEXT: shll %cl, %eax -; X86-BMI1NOTBM-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bzhi32_b1_indexzext: @@ -871,11 +866,8 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi32_b1_indexzext: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %esi +; X64-BMI1NOTBM-NEXT: bextrl %esi, %edi, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi32_b1_indexzext: @@ -904,9 +896,8 @@ define i32 @bzhi32_b2_load(i32* %w, i32 %numlowbits) nounwind { ; X86-BMI1NOTBM: # %bb.0: ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1NOTBM-NEXT: movl 
$-1, %edx -; X86-BMI1NOTBM-NEXT: shll %cl, %edx -; X86-BMI1NOTBM-NEXT: andnl (%eax), %edx, %eax +; X86-BMI1NOTBM-NEXT: shll $8, %ecx +; X86-BMI1NOTBM-NEXT: bextrl %ecx, (%eax), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bzhi32_b2_load: @@ -928,11 +919,8 @@ define i32 @bzhi32_b2_load(i32* %w, i32 %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi32_b2_load: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl (%rdi), %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %esi +; X64-BMI1NOTBM-NEXT: bextrl %esi, (%rdi), %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi32_b2_load: @@ -961,9 +949,8 @@ define i32 @bzhi32_b3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind { ; X86-BMI1NOTBM: # %bb.0: ; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1NOTBM-NEXT: movl $-1, %edx -; X86-BMI1NOTBM-NEXT: shll %cl, %edx -; X86-BMI1NOTBM-NEXT: andnl (%eax), %edx, %eax +; X86-BMI1NOTBM-NEXT: shll $8, %ecx +; X86-BMI1NOTBM-NEXT: bextrl %ecx, (%eax), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bzhi32_b3_load_indexzext: @@ -985,11 +972,8 @@ define i32 @bzhi32_b3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi32_b3_load_indexzext: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl (%rdi), %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %esi +; X64-BMI1NOTBM-NEXT: bextrl %esi, (%rdi), %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi32_b3_load_indexzext: @@ -1016,10 +1000,9 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind { ; ; X86-BMI1NOTBM-LABEL: 
bzhi32_b4_commutative: ; X86-BMI1NOTBM: # %bb.0: -; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-BMI1NOTBM-NEXT: movl $-1, %eax -; X86-BMI1NOTBM-NEXT: shll %cl, %eax -; X86-BMI1NOTBM-NEXT: andnl {{[0-9]+}}(%esp), %eax, %eax +; X86-BMI1NOTBM-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-BMI1NOTBM-NEXT: shll $8, %eax +; X86-BMI1NOTBM-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax ; X86-BMI1NOTBM-NEXT: retl ; ; X86-BMI1BMI2-LABEL: bzhi32_b4_commutative: @@ -1040,11 +1023,8 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi32_b4_commutative: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movl $-1, %eax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shll %cl, %eax -; X64-BMI1NOTBM-NEXT: andnl %edi, %eax, %eax +; X64-BMI1NOTBM-NEXT: shll $8, %esi +; X64-BMI1NOTBM-NEXT: bextrl %esi, %edi, %eax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi32_b4_commutative: @@ -1128,11 +1108,8 @@ define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi64_b0: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rax +; X64-BMI1NOTBM-NEXT: shlq $8, %rsi +; X64-BMI1NOTBM-NEXT: bextrq %rsi, %rdi, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi64_b0: @@ -1214,11 +1191,9 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi64_b1_indexzext: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rax +; X64-BMI1NOTBM-NEXT: # kill: def $esi killed $esi def $rsi +; X64-BMI1NOTBM-NEXT: 
shlq $8, %rsi +; X64-BMI1NOTBM-NEXT: bextrq %rsi, %rdi, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi64_b1_indexzext: @@ -1307,11 +1282,8 @@ define i64 @bzhi64_b2_load(i64* %w, i64 %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi64_b2_load: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq (%rdi), %rax, %rax +; X64-BMI1NOTBM-NEXT: shlq $8, %rsi +; X64-BMI1NOTBM-NEXT: bextrq %rsi, (%rdi), %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi64_b2_load: @@ -1399,11 +1371,9 @@ define i64 @bzhi64_b3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi64_b3_load_indexzext: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movl %esi, %ecx -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq (%rdi), %rax, %rax +; X64-BMI1NOTBM-NEXT: # kill: def $esi killed $esi def $rsi +; X64-BMI1NOTBM-NEXT: shlq $8, %rsi +; X64-BMI1NOTBM-NEXT: bextrq %rsi, (%rdi), %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi64_b3_load_indexzext: @@ -1488,11 +1458,8 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind { ; ; X64-BMI1NOTBM-LABEL: bzhi64_b4_commutative: ; X64-BMI1NOTBM: # %bb.0: -; X64-BMI1NOTBM-NEXT: movq %rsi, %rcx -; X64-BMI1NOTBM-NEXT: movq $-1, %rax -; X64-BMI1NOTBM-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-BMI1NOTBM-NEXT: shlq %cl, %rax -; X64-BMI1NOTBM-NEXT: andnq %rdi, %rax, %rax +; X64-BMI1NOTBM-NEXT: shlq $8, %rsi +; X64-BMI1NOTBM-NEXT: bextrq %rsi, %rdi, %rax ; X64-BMI1NOTBM-NEXT: retq ; ; X64-BMI1BMI2-LABEL: bzhi64_b4_commutative: -- GitLab From fb7e6913cb74b9709a840df8d6397a7eedd63606 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 11 Oct 2018 08:46:39 +0000 Subject: [PATCH 
0045/1116] [NFC] Factor out getOrCreateAddRecExpr method git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344227 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/ScalarEvolution.h | 4 +++ lib/Analysis/ScalarEvolution.cpp | 42 ++++++++++++++----------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 89918e3c205..8f4200b07e5 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -1833,6 +1833,10 @@ private: const SCEV *getOrCreateMulExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags Flags); + // Get addrec expr already created or create a new one. + const SCEV *getOrCreateAddRecExpr(SmallVectorImpl &Ops, + const Loop *L, SCEV::NoWrapFlags Flags); + /// Return x if \p Val is f(x) where f is a 1-1 function. const SCEV *stripInjectiveFunctions(const SCEV *Val) const; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index d99d4767366..193020ed92f 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -2758,6 +2758,29 @@ ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl &Ops, return S; } +const SCEV * +ScalarEvolution::getOrCreateAddRecExpr(SmallVectorImpl &Ops, + const Loop *L, SCEV::NoWrapFlags Flags) { + FoldingSetNodeID ID; + ID.AddInteger(scAddRecExpr); + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + ID.AddPointer(Ops[i]); + ID.AddPointer(L); + void *IP = nullptr; + SCEVAddRecExpr *S = + static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); + if (!S) { + const SCEV **O = SCEVAllocator.Allocate(Ops.size()); + std::uninitialized_copy(Ops.begin(), Ops.end(), O); + S = new (SCEVAllocator) + SCEVAddRecExpr(ID.Intern(SCEVAllocator), O, Ops.size(), L); + UniqueSCEVs.InsertNode(S, IP); + addToLoopUseLists(S); + } + S->setNoWrapFlags(Flags); + return S; +} + const SCEV * ScalarEvolution::getOrCreateMulExpr(SmallVectorImpl &Ops, 
SCEV::NoWrapFlags Flags) { @@ -3408,24 +3431,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl &Operands, // Okay, it looks like we really DO need an addrec expr. Check to see if we // already have one, otherwise create a new one. - FoldingSetNodeID ID; - ID.AddInteger(scAddRecExpr); - for (unsigned i = 0, e = Operands.size(); i != e; ++i) - ID.AddPointer(Operands[i]); - ID.AddPointer(L); - void *IP = nullptr; - SCEVAddRecExpr *S = - static_cast(UniqueSCEVs.FindNodeOrInsertPos(ID, IP)); - if (!S) { - const SCEV **O = SCEVAllocator.Allocate(Operands.size()); - std::uninitialized_copy(Operands.begin(), Operands.end(), O); - S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator), - O, Operands.size(), L); - UniqueSCEVs.InsertNode(S, IP); - addToLoopUseLists(S); - } - S->setNoWrapFlags(Flags); - return S; + return getOrCreateAddRecExpr(Operands, L, Flags); } const SCEV * -- GitLab From 6298cb983b2b06783bc0f60b88f104cbe932b824 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Thu, 11 Oct 2018 08:53:43 +0000 Subject: [PATCH 0046/1116] [gcov] Display the hit counter for the line of a function definition Summary: Right now there is no hit counter on the line of a function definition. So the idea is to add the line of the function to all the lines covered by the entry block. 
Tests in compiler-rt/profile will be fixed in another patch: https://reviews.llvm.org/D49854 Reviewers: marco-c, davidxl Reviewed By: marco-c Subscribers: sylvestre.ledru, llvm-commits Differential Revision: https://reviews.llvm.org/D49853 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344228 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Instrumentation/GCOVProfiling.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 625b354cc38..a060dd53513 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -570,6 +570,12 @@ void GCOVProfiler::emitProfileNotes() { Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); + // Add the function line number to the lines of the entry block + // to have a counter for the function definition. + Func.getBlock(&EntryBlock) + .getFile(SP->getFilename()) + .addLine(SP->getLine()); + for (auto &BB : F) { GCOVBlock &Block = Func.getBlock(&BB); TerminatorInst *TI = BB.getTerminator(); -- GitLab From 522fc4db7cd5cd3e490fb5d0a95491ed3d99b4f1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 11 Oct 2018 09:27:24 +0000 Subject: [PATCH 0047/1116] [LV] Ignore more debug info. We can avoid doing some unnecessary work by skipping debug instructions in a few loops. It also helps to ensure debug instructions do not prevent vectorization, although I do not have any concrete test cases for that. 
Reviewers: rengolin, hsaito, dcaballe, aprantl, vsk Reviewed By: rengolin, dcaballe Differential Revision: https://reviews.llvm.org/D53091 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344232 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index c7c4568377b..cad1711b17f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4655,7 +4655,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { // For each block. for (BasicBlock *BB : TheLoop->blocks()) { // For each instruction in the loop. - for (Instruction &I : *BB) { + for (Instruction &I : BB->instructionsWithoutDebug()) { Type *T = I.getType(); // Skip ignored values. @@ -4893,7 +4893,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { unsigned Index = 0; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - for (Instruction &I : *BB) { + for (Instruction &I : BB->instructionsWithoutDebug()) { IdxToInstr[Index++] = &I; // Save the end location of each USE. -- GitLab From e13d09452159012b82271b7e655a02e16da76887 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 11 Oct 2018 09:46:25 +0000 Subject: [PATCH 0048/1116] [LV] Use SmallVector instead of DenseMap in calculateRegisterUsage (NFC). We assign indices sequentially for seen instructions, so we can just use a vector and push back the seen instructions. No need for using a DenseMap. 
Reviewers: hsaito, rengolin, nadav, dcaballe Reviewed By: rengolin Differential Revision: https://reviews.llvm.org/D53089 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344233 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index cad1711b17f..7ebe8d102b7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4882,7 +4882,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { using IntervalMap = DenseMap; // Maps instruction to its index. - DenseMap IdxToInstr; + SmallVector IdxToInstr; // Marks the end of each interval. IntervalMap EndPoint; // Saves the list of instruction indices that are used in the loop. @@ -4891,10 +4891,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { // defined outside the loop, such as arguments and constants. SmallPtrSet LoopInvariants; - unsigned Index = 0; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { for (Instruction &I : BB->instructionsWithoutDebug()) { - IdxToInstr[Index++] = &I; + IdxToInstr.push_back(&I); // Save the end location of each USE. for (Value *U : I.operands()) { @@ -4911,7 +4910,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { } // Overwrite previous end points. - EndPoint[Instr] = Index; + EndPoint[Instr] = IdxToInstr.size(); Ends.insert(Instr); } } @@ -4948,7 +4947,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { return std::max(1, VF * TypeSize / WidestRegister); }; - for (unsigned int i = 0; i < Index; ++i) { + for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) { Instruction *I = IdxToInstr[i]; // Remove all of the instructions that end at this location. 
-- GitLab From 7a175729942d0d92d50149b3963133716d743a61 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Thu, 11 Oct 2018 10:39:03 +0000 Subject: [PATCH 0049/1116] [tblgen][CodeGenSchedule] Add a check for invalid RegisterFile definitions with zero physical registers. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344235 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp | 10 ++++------ utils/TableGen/CodeGenSchedule.cpp | 5 +++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp index 01131253b5b..4cfe1a50f53 100644 --- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -45,13 +45,11 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) { // object. The size of every register file, as well as the mapping between // register files and register classes is specified via tablegen. const MCExtraProcessorInfo &Info = SM.getExtraProcessorInfo(); - for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) { + + // Skip invalid register file at index 0. + for (unsigned I = 1, E = Info.NumRegisterFiles; I < E; ++I) { const MCRegisterFileDesc &RF = Info.RegisterFiles[I]; - // Skip invalid register files with zero physical registers. - // TODO: verify this constraint in SubtargetEmitter, and convert this - // statement into an assert. - if (!RF.NumPhysRegs) - continue; + assert(RF.NumPhysRegs && "Invalid PRF with zero physical registers!"); // The cost of a register definition is equivalent to the number of // physical registers that are allocated at register renaming stage. 
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index 881f1a813f2..f8d7d9ad3d3 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -1763,6 +1763,11 @@ void CodeGenSchedModels::collectRegisterFiles() { // Now set the number of physical registers as well as the cost of registers // in each register class. CGRF.NumPhysRegs = RF->getValueAsInt("NumPhysRegs"); + if (!CGRF.NumPhysRegs) { + PrintFatalError(RF->getLoc(), + "Invalid RegisterFile with zero physical registers"); + } + RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses"); std::vector RegisterCosts = RF->getValueAsListOfInts("RegCosts"); for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) { -- GitLab From 3572c80cdbe16b19fa1c09846a50e8b7728b5ca1 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 11 Oct 2018 10:46:12 +0000 Subject: [PATCH 0050/1116] [InstCombine] Add tests for demand bits of min/max. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344236 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/minmax-demandbits.ll | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 test/Transforms/InstCombine/minmax-demandbits.ll diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll new file mode 100644 index 00000000000..8977c19856f --- /dev/null +++ b/test/Transforms/InstCombine/minmax-demandbits.ll @@ -0,0 +1,256 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine < %s | FileCheck %s + + +define i32 @and_umax_less(i32 %A) { +; CHECK-LABEL: @and_umax_less( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 31 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 31 +; CHECK-NEXT: [[X:%.*]] = and i32 [[L1]], -32 +; CHECK-NEXT: ret i32 [[X]] +; + %l0 = icmp ugt i32 31, %A + %l1 = select i1 %l0, i32 31, i32 %A + %x = and i32 %l1, -32 + ret i32 %x +} + 
+define i32 @and_umax_muchless(i32 %A) { +; CHECK-LABEL: @and_umax_muchless( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 12 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 12 +; CHECK-NEXT: [[X:%.*]] = and i32 [[L1]], -32 +; CHECK-NEXT: ret i32 [[X]] +; + %l0 = icmp ugt i32 12, %A + %l1 = select i1 %l0, i32 12, i32 %A + %x = and i32 %l1, -32 + ret i32 %x +} + +define i32 @and_umax_more(i32 %A) { +; CHECK-LABEL: @and_umax_more( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 32 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 32 +; CHECK-NEXT: [[X:%.*]] = and i32 [[L1]], -32 +; CHECK-NEXT: ret i32 [[X]] +; + %l0 = icmp ugt i32 32, %A + %l1 = select i1 %l0, i32 32, i32 %A + %x = and i32 %l1, -32 + ret i32 %x +} + +define i32 @shr_umax(i32 %A) { +; CHECK-LABEL: @shr_umax( +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 15 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 15 +; CHECK-NEXT: [[X:%.*]] = lshr i32 [[L1]], 4 +; CHECK-NEXT: ret i32 [[X]] +; + %l0 = icmp ugt i32 15, %A + %l1 = select i1 %l0, i32 15, i32 %A + %x = lshr i32 %l1, 4 + ret i32 %x +} + +; Various constants for C2 & umax(A, C1) + +define i8 @t_0_1(i8 %A) { +; CHECK-LABEL: @t_0_1( +; CHECK-NEXT: [[X:%.*]] = and i8 [[A:%.*]], 1 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 0 + %l1 = select i1 %l2, i8 %A, i8 0 + %x = and i8 %l1, 1 + ret i8 %x +} + +define i8 @t_0_10(i8 %A) { +; CHECK-LABEL: @t_0_10( +; CHECK-NEXT: [[X:%.*]] = and i8 [[A:%.*]], 10 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 0 + %l1 = select i1 %l2, i8 %A, i8 0 + %x = and i8 %l1, 10 + ret i8 %x +} + +define i8 @t_1_10(i8 %A) { +; CHECK-LABEL: @t_1_10( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], 10 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 1 + %l1 = select i1 %l2, i8 %A, i8 1 + %x = and i8 %l1, 10 + ret i8 %x +} + +define i8 
@t_2_4(i8 %A) { +; CHECK-LABEL: @t_2_4( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], 4 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 2 + %l1 = select i1 %l2, i8 %A, i8 2 + %x = and i8 %l1, 4 + ret i8 %x +} + +define i8 @t_2_192(i8 %A) { +; CHECK-LABEL: @t_2_192( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], -64 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 2 + %l1 = select i1 %l2, i8 %A, i8 2 + %x = and i8 %l1, -64 + ret i8 %x +} + +define i8 @t_2_63_or(i8 %A) { +; CHECK-LABEL: @t_2_63_or( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2 +; CHECK-NEXT: [[X:%.*]] = or i8 [[L1]], 63 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 2 + %l1 = select i1 %l2, i8 %A, i8 2 + %x = or i8 %l1, 63 + ret i8 %x +} + +define i8 @f_1_1(i8 %A) { +; CHECK-LABEL: @f_1_1( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], 1 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 1 + %l1 = select i1 %l2, i8 %A, i8 1 + %x = and i8 %l1, 1 + ret i8 %x +} + +define i8 @f_32_32(i8 %A) { +; CHECK-LABEL: @f_32_32( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 32 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 32 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], -32 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 32 + %l1 = select i1 %l2, i8 %A, i8 32 + %x = and i8 %l1, -32 + ret i8 %x +} + +define i8 @f_191_192(i8 %A) { +; CHECK-LABEL: @f_191_192( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], -65 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 -65 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], -64 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 191 + %l1 
= select i1 %l2, i8 %A, i8 191 + %x = and i8 %l1, 192 + ret i8 %x +} + +define i8 @f_10_1(i8 %A) { +; CHECK-LABEL: @f_10_1( +; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 10 +; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 10 +; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], 1 +; CHECK-NEXT: ret i8 [[X]] +; + %l2 = icmp ugt i8 %A, 10 + %l1 = select i1 %l2, i8 %A, i8 10 + %x = and i8 %l1, 1 + ret i8 %x +} + +define i32 @and_umin(i32 %A) { +; CHECK-LABEL: @and_umin( +; CHECK-NEXT: ret i32 0 +; + %l0 = icmp ult i32 15, %A + %l1 = select i1 %l0, i32 15, i32 %A + %x = and i32 %l1, -32 + ret i32 %x +} + +define i32 @or_umin(i32 %A) { +; CHECK-LABEL: @or_umin( +; CHECK-NEXT: ret i32 31 +; + %l0 = icmp ult i32 15, %A + %l1 = select i1 %l0, i32 15, i32 %A + %x = or i32 %l1, 31 + ret i32 %x +} + +define i8 @or_min_31_30(i8 %A) { +; CHECK-LABEL: @or_min_31_30( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A:%.*]], -30 +; CHECK-NEXT: [[MIN:%.*]] = select i1 [[CMP]], i8 [[A]], i8 -30 +; CHECK-NEXT: [[R:%.*]] = or i8 [[MIN]], 31 +; CHECK-NEXT: ret i8 [[R]] +; + %cmp = icmp ult i8 %A, -30 + %min = select i1 %cmp, i8 %A, i8 -30 + %r = or i8 %min, 31 + ret i8 %r +} + +define i8 @and_min_7_7(i8 %A) { +; CHECK-LABEL: @and_min_7_7( +; CHECK-NEXT: [[L2:%.*]] = icmp ult i8 [[A:%.*]], -7 +; CHECK-NEXT: [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -7 +; CHECK-NEXT: [[R:%.*]] = and i8 [[MIN]], -8 +; CHECK-NEXT: ret i8 [[R]] +; + %l2 = icmp ult i8 %A, -7 + %min = select i1 %l2, i8 %A, i8 -7 + %r = and i8 %min, -8 + ret i8 %r +} + +define i8 @and_min_7_8(i8 %A) { +; CHECK-LABEL: @and_min_7_8( +; CHECK-NEXT: [[L2:%.*]] = icmp ult i8 [[A:%.*]], -8 +; CHECK-NEXT: [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -8 +; CHECK-NEXT: [[R:%.*]] = and i8 [[MIN]], -8 +; CHECK-NEXT: ret i8 [[R]] +; + %l2 = icmp ult i8 %A, -8 + %min = select i1 %l2, i8 %A, i8 -8 + %r = and i8 %min, -8 + ret i8 %r +} + +define i8 @and_min_7_9(i8 %A) { +; CHECK-LABEL: @and_min_7_9( +; CHECK-NEXT: [[L2:%.*]] = icmp ult i8 
[[A:%.*]], -9 +; CHECK-NEXT: [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -9 +; CHECK-NEXT: [[R:%.*]] = and i8 [[MIN]], -8 +; CHECK-NEXT: ret i8 [[R]] +; + %l2 = icmp ult i8 %A, -9 + %min = select i1 %l2, i8 %A, i8 -9 + %r = and i8 %min, -8 + ret i8 %r +} + -- GitLab From c3c05ee92fb49821124862c964f91a7e3a27f0b2 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 11 Oct 2018 11:04:09 +0000 Subject: [PATCH 0051/1116] [InstCombine] Demand bits of UMax Use the demanded bits of umax(A,C) to prove we can just use A so long as the lowest non-zero bit of DemandMask is higher than the highest non-zero bit of C Differential Revision: https://reviews.llvm.org/D53033 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344237 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombineSimplifyDemanded.cpp | 20 ++++++++++--- .../InstCombine/minmax-demandbits.ll | 28 +++++-------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 936daa828a5..18a2b2fdbfe 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -314,11 +314,22 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Known.One = std::move(IKnownOne); break; } - case Instruction::Select: - // If this is a select as part of a min/max pattern, don't simplify any - // further in case we break the structure. + case Instruction::Select: { Value *LHS, *RHS; - if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN) + SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; + if (SPF == SPF_UMAX) { + // UMax(A, C) == A if ... + // The lowest non-zero bit of DemandMask is higher than the highest + // non-zero bit of C. 
+ const APInt *C; + unsigned CTZ = DemandedMask.countTrailingZeros(); + if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits()) + return LHS; + } + + // If this is a select as part of any other min/max pattern, don't simplify + // any further in case we break the structure. + if (SPF != SPF_UNKNOWN) return nullptr; if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || @@ -336,6 +347,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Known.One = RHSKnown.One & LHSKnown.One; Known.Zero = RHSKnown.Zero & LHSKnown.Zero; break; + } case Instruction::ZExt: case Instruction::Trunc: { unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll index 8977c19856f..f838560f965 100644 --- a/test/Transforms/InstCombine/minmax-demandbits.ll +++ b/test/Transforms/InstCombine/minmax-demandbits.ll @@ -4,9 +4,7 @@ define i32 @and_umax_less(i32 %A) { ; CHECK-LABEL: @and_umax_less( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 31 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 31 -; CHECK-NEXT: [[X:%.*]] = and i32 [[L1]], -32 +; CHECK-NEXT: [[X:%.*]] = and i32 [[A:%.*]], -32 ; CHECK-NEXT: ret i32 [[X]] ; %l0 = icmp ugt i32 31, %A @@ -17,9 +15,7 @@ define i32 @and_umax_less(i32 %A) { define i32 @and_umax_muchless(i32 %A) { ; CHECK-LABEL: @and_umax_muchless( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 12 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 12 -; CHECK-NEXT: [[X:%.*]] = and i32 [[L1]], -32 +; CHECK-NEXT: [[X:%.*]] = and i32 [[A:%.*]], -32 ; CHECK-NEXT: ret i32 [[X]] ; %l0 = icmp ugt i32 12, %A @@ -43,9 +39,7 @@ define i32 @and_umax_more(i32 %A) { define i32 @shr_umax(i32 %A) { ; CHECK-LABEL: @shr_umax( -; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 15 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 15 -; CHECK-NEXT: [[X:%.*]] = lshr i32 
[[L1]], 4 +; CHECK-NEXT: [[X:%.*]] = lshr i32 [[A:%.*]], 4 ; CHECK-NEXT: ret i32 [[X]] ; %l0 = icmp ugt i32 15, %A @@ -80,9 +74,7 @@ define i8 @t_0_10(i8 %A) { define i8 @t_1_10(i8 %A) { ; CHECK-LABEL: @t_1_10( -; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1 -; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], 10 +; CHECK-NEXT: [[X:%.*]] = and i8 [[A:%.*]], 10 ; CHECK-NEXT: ret i8 [[X]] ; %l2 = icmp ugt i8 %A, 1 @@ -93,9 +85,7 @@ define i8 @t_1_10(i8 %A) { define i8 @t_2_4(i8 %A) { ; CHECK-LABEL: @t_2_4( -; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2 -; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], 4 +; CHECK-NEXT: [[X:%.*]] = and i8 [[A:%.*]], 4 ; CHECK-NEXT: ret i8 [[X]] ; %l2 = icmp ugt i8 %A, 2 @@ -106,9 +96,7 @@ define i8 @t_2_4(i8 %A) { define i8 @t_2_192(i8 %A) { ; CHECK-LABEL: @t_2_192( -; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2 -; CHECK-NEXT: [[X:%.*]] = and i8 [[L1]], -64 +; CHECK-NEXT: [[X:%.*]] = and i8 [[A:%.*]], -64 ; CHECK-NEXT: ret i8 [[X]] ; %l2 = icmp ugt i8 %A, 2 @@ -119,9 +107,7 @@ define i8 @t_2_192(i8 %A) { define i8 @t_2_63_or(i8 %A) { ; CHECK-LABEL: @t_2_63_or( -; CHECK-NEXT: [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 2 -; CHECK-NEXT: [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 2 -; CHECK-NEXT: [[X:%.*]] = or i8 [[L1]], 63 +; CHECK-NEXT: [[X:%.*]] = or i8 [[A:%.*]], 63 ; CHECK-NEXT: ret i8 [[X]] ; %l2 = icmp ugt i8 %A, 2 -- GitLab From 3e9802e2f980fe66a85921585d0fdcfb40d96184 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Thu, 11 Oct 2018 11:11:58 +0000 Subject: [PATCH 0052/1116] [RISCV] Re-generate test/CodeGen/RISCV/vararg.ll after r344142 The improved load-store forwarding committed in r344142 broke this test. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344238 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/RISCV/vararg.ll | 66 +++++++++++++++++------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/test/CodeGen/RISCV/vararg.ll b/test/CodeGen/RISCV/vararg.ll index ac08f346fbb..77f8f300956 100644 --- a/test/CodeGen/RISCV/vararg.ll +++ b/test/CodeGen/RISCV/vararg.ll @@ -17,16 +17,16 @@ define i32 @va1(i8* %fmt, ...) nounwind { ; RV32I-FPELIM-LABEL: va1: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -48 -; RV32I-FPELIM-NEXT: sw a1, 20(sp) +; RV32I-FPELIM-NEXT: mv a0, a1 ; RV32I-FPELIM-NEXT: sw a7, 44(sp) ; RV32I-FPELIM-NEXT: sw a6, 40(sp) ; RV32I-FPELIM-NEXT: sw a5, 36(sp) ; RV32I-FPELIM-NEXT: sw a4, 32(sp) ; RV32I-FPELIM-NEXT: sw a3, 28(sp) ; RV32I-FPELIM-NEXT: sw a2, 24(sp) -; RV32I-FPELIM-NEXT: addi a0, sp, 24 -; RV32I-FPELIM-NEXT: sw a0, 12(sp) -; RV32I-FPELIM-NEXT: lw a0, 20(sp) +; RV32I-FPELIM-NEXT: addi a1, sp, 24 +; RV32I-FPELIM-NEXT: sw a1, 12(sp) +; RV32I-FPELIM-NEXT: sw a0, 20(sp) ; RV32I-FPELIM-NEXT: addi sp, sp, 48 ; RV32I-FPELIM-NEXT: ret ; @@ -36,16 +36,16 @@ define i32 @va1(i8* %fmt, ...) nounwind { ; RV32I-WITHFP-NEXT: sw ra, 12(sp) ; RV32I-WITHFP-NEXT: sw s0, 8(sp) ; RV32I-WITHFP-NEXT: addi s0, sp, 16 -; RV32I-WITHFP-NEXT: sw a1, 4(s0) +; RV32I-WITHFP-NEXT: mv a0, a1 ; RV32I-WITHFP-NEXT: sw a7, 28(s0) ; RV32I-WITHFP-NEXT: sw a6, 24(s0) ; RV32I-WITHFP-NEXT: sw a5, 20(s0) ; RV32I-WITHFP-NEXT: sw a4, 16(s0) ; RV32I-WITHFP-NEXT: sw a3, 12(s0) ; RV32I-WITHFP-NEXT: sw a2, 8(s0) -; RV32I-WITHFP-NEXT: addi a0, s0, 8 -; RV32I-WITHFP-NEXT: sw a0, -12(s0) -; RV32I-WITHFP-NEXT: lw a0, 4(s0) +; RV32I-WITHFP-NEXT: addi a1, s0, 8 +; RV32I-WITHFP-NEXT: sw a1, -12(s0) +; RV32I-WITHFP-NEXT: sw a0, 4(s0) ; RV32I-WITHFP-NEXT: lw s0, 8(sp) ; RV32I-WITHFP-NEXT: lw ra, 12(sp) ; RV32I-WITHFP-NEXT: addi sp, sp, 48 @@ -66,16 +66,16 @@ define i32 @va1_va_arg(i8* %fmt, ...) 
nounwind { ; RV32I-FPELIM-LABEL: va1_va_arg: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -48 -; RV32I-FPELIM-NEXT: sw a1, 20(sp) +; RV32I-FPELIM-NEXT: mv a0, a1 ; RV32I-FPELIM-NEXT: sw a7, 44(sp) ; RV32I-FPELIM-NEXT: sw a6, 40(sp) ; RV32I-FPELIM-NEXT: sw a5, 36(sp) ; RV32I-FPELIM-NEXT: sw a4, 32(sp) ; RV32I-FPELIM-NEXT: sw a3, 28(sp) ; RV32I-FPELIM-NEXT: sw a2, 24(sp) -; RV32I-FPELIM-NEXT: addi a0, sp, 24 -; RV32I-FPELIM-NEXT: sw a0, 12(sp) -; RV32I-FPELIM-NEXT: lw a0, 20(sp) +; RV32I-FPELIM-NEXT: addi a1, sp, 24 +; RV32I-FPELIM-NEXT: sw a1, 12(sp) +; RV32I-FPELIM-NEXT: sw a0, 20(sp) ; RV32I-FPELIM-NEXT: addi sp, sp, 48 ; RV32I-FPELIM-NEXT: ret ; @@ -85,16 +85,16 @@ define i32 @va1_va_arg(i8* %fmt, ...) nounwind { ; RV32I-WITHFP-NEXT: sw ra, 12(sp) ; RV32I-WITHFP-NEXT: sw s0, 8(sp) ; RV32I-WITHFP-NEXT: addi s0, sp, 16 -; RV32I-WITHFP-NEXT: sw a1, 4(s0) +; RV32I-WITHFP-NEXT: mv a0, a1 ; RV32I-WITHFP-NEXT: sw a7, 28(s0) ; RV32I-WITHFP-NEXT: sw a6, 24(s0) ; RV32I-WITHFP-NEXT: sw a5, 20(s0) ; RV32I-WITHFP-NEXT: sw a4, 16(s0) ; RV32I-WITHFP-NEXT: sw a3, 12(s0) ; RV32I-WITHFP-NEXT: sw a2, 8(s0) -; RV32I-WITHFP-NEXT: addi a0, s0, 8 -; RV32I-WITHFP-NEXT: sw a0, -12(s0) -; RV32I-WITHFP-NEXT: lw a0, 4(s0) +; RV32I-WITHFP-NEXT: addi a1, s0, 8 +; RV32I-WITHFP-NEXT: sw a1, -12(s0) +; RV32I-WITHFP-NEXT: sw a0, 4(s0) ; RV32I-WITHFP-NEXT: lw s0, 8(sp) ; RV32I-WITHFP-NEXT: lw ra, 12(sp) ; RV32I-WITHFP-NEXT: addi sp, sp, 48 @@ -117,7 +117,7 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind { ; RV32I-FPELIM-NEXT: sw s0, 8(sp) ; RV32I-FPELIM-NEXT: sw s1, 4(sp) ; RV32I-FPELIM-NEXT: addi s0, sp, 16 -; RV32I-FPELIM-NEXT: sw a1, 4(s0) +; RV32I-FPELIM-NEXT: mv s1, a1 ; RV32I-FPELIM-NEXT: sw a7, 28(s0) ; RV32I-FPELIM-NEXT: sw a6, 24(s0) ; RV32I-FPELIM-NEXT: sw a5, 20(s0) @@ -126,8 +126,8 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) 
nounwind { ; RV32I-FPELIM-NEXT: sw a2, 8(s0) ; RV32I-FPELIM-NEXT: addi a0, s0, 8 ; RV32I-FPELIM-NEXT: sw a0, -16(s0) -; RV32I-FPELIM-NEXT: lw s1, 4(s0) -; RV32I-FPELIM-NEXT: addi a0, s1, 15 +; RV32I-FPELIM-NEXT: sw a1, 4(s0) +; RV32I-FPELIM-NEXT: addi a0, a1, 15 ; RV32I-FPELIM-NEXT: andi a0, a0, -16 ; RV32I-FPELIM-NEXT: sub a0, sp, a0 ; RV32I-FPELIM-NEXT: mv sp, a0 @@ -147,7 +147,7 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind { ; RV32I-WITHFP-NEXT: sw s0, 8(sp) ; RV32I-WITHFP-NEXT: sw s1, 4(sp) ; RV32I-WITHFP-NEXT: addi s0, sp, 16 -; RV32I-WITHFP-NEXT: sw a1, 4(s0) +; RV32I-WITHFP-NEXT: mv s1, a1 ; RV32I-WITHFP-NEXT: sw a7, 28(s0) ; RV32I-WITHFP-NEXT: sw a6, 24(s0) ; RV32I-WITHFP-NEXT: sw a5, 20(s0) @@ -156,8 +156,8 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind { ; RV32I-WITHFP-NEXT: sw a2, 8(s0) ; RV32I-WITHFP-NEXT: addi a0, s0, 8 ; RV32I-WITHFP-NEXT: sw a0, -16(s0) -; RV32I-WITHFP-NEXT: lw s1, 4(s0) -; RV32I-WITHFP-NEXT: addi a0, s1, 15 +; RV32I-WITHFP-NEXT: sw a1, 4(s0) +; RV32I-WITHFP-NEXT: addi a0, a1, 15 ; RV32I-WITHFP-NEXT: andi a0, a0, -16 ; RV32I-WITHFP-NEXT: sub a0, sp, a0 ; RV32I-WITHFP-NEXT: mv sp, a0 @@ -535,17 +535,17 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind { ; RV32I-FPELIM-NEXT: addi sp, sp, -48 ; RV32I-FPELIM-NEXT: sw ra, 12(sp) ; RV32I-FPELIM-NEXT: sw s1, 8(sp) -; RV32I-FPELIM-NEXT: sw a1, 20(sp) +; RV32I-FPELIM-NEXT: mv s1, a1 ; RV32I-FPELIM-NEXT: sw a7, 44(sp) ; RV32I-FPELIM-NEXT: sw a6, 40(sp) ; RV32I-FPELIM-NEXT: sw a5, 36(sp) ; RV32I-FPELIM-NEXT: sw a4, 32(sp) ; RV32I-FPELIM-NEXT: sw a3, 28(sp) ; RV32I-FPELIM-NEXT: sw a2, 24(sp) +; RV32I-FPELIM-NEXT: sw a1, 20(sp) ; RV32I-FPELIM-NEXT: addi a0, sp, 24 ; RV32I-FPELIM-NEXT: sw a0, 4(sp) ; RV32I-FPELIM-NEXT: sw a0, 0(sp) -; RV32I-FPELIM-NEXT: lw s1, 20(sp) ; RV32I-FPELIM-NEXT: call notdead ; RV32I-FPELIM-NEXT: lw a0, 4(sp) ; RV32I-FPELIM-NEXT: addi a0, a0, 3 @@ -578,17 +578,17 @@ define i32 @va4_va_copy(i32 %argno, ...) 
nounwind { ; RV32I-WITHFP-NEXT: sw s0, 24(sp) ; RV32I-WITHFP-NEXT: sw s1, 20(sp) ; RV32I-WITHFP-NEXT: addi s0, sp, 32 -; RV32I-WITHFP-NEXT: sw a1, 4(s0) +; RV32I-WITHFP-NEXT: mv s1, a1 ; RV32I-WITHFP-NEXT: sw a7, 28(s0) ; RV32I-WITHFP-NEXT: sw a6, 24(s0) ; RV32I-WITHFP-NEXT: sw a5, 20(s0) ; RV32I-WITHFP-NEXT: sw a4, 16(s0) ; RV32I-WITHFP-NEXT: sw a3, 12(s0) ; RV32I-WITHFP-NEXT: sw a2, 8(s0) +; RV32I-WITHFP-NEXT: sw a1, 4(s0) ; RV32I-WITHFP-NEXT: addi a0, s0, 8 ; RV32I-WITHFP-NEXT: sw a0, -16(s0) ; RV32I-WITHFP-NEXT: sw a0, -20(s0) -; RV32I-WITHFP-NEXT: lw s1, 4(s0) ; RV32I-WITHFP-NEXT: call notdead ; RV32I-WITHFP-NEXT: lw a0, -16(s0) ; RV32I-WITHFP-NEXT: addi a0, a0, 3 @@ -777,7 +777,6 @@ define i32 @va6_no_fixed_args(...) nounwind { ; RV32I-FPELIM-LABEL: va6_no_fixed_args: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: addi sp, sp, -48 -; RV32I-FPELIM-NEXT: sw a0, 16(sp) ; RV32I-FPELIM-NEXT: sw a7, 44(sp) ; RV32I-FPELIM-NEXT: sw a6, 40(sp) ; RV32I-FPELIM-NEXT: sw a5, 36(sp) @@ -785,9 +784,9 @@ define i32 @va6_no_fixed_args(...) nounwind { ; RV32I-FPELIM-NEXT: sw a3, 28(sp) ; RV32I-FPELIM-NEXT: sw a2, 24(sp) ; RV32I-FPELIM-NEXT: sw a1, 20(sp) -; RV32I-FPELIM-NEXT: addi a0, sp, 20 -; RV32I-FPELIM-NEXT: sw a0, 12(sp) -; RV32I-FPELIM-NEXT: lw a0, 16(sp) +; RV32I-FPELIM-NEXT: addi a1, sp, 20 +; RV32I-FPELIM-NEXT: sw a1, 12(sp) +; RV32I-FPELIM-NEXT: sw a0, 16(sp) ; RV32I-FPELIM-NEXT: addi sp, sp, 48 ; RV32I-FPELIM-NEXT: ret ; @@ -797,7 +796,6 @@ define i32 @va6_no_fixed_args(...) nounwind { ; RV32I-WITHFP-NEXT: sw ra, 12(sp) ; RV32I-WITHFP-NEXT: sw s0, 8(sp) ; RV32I-WITHFP-NEXT: addi s0, sp, 16 -; RV32I-WITHFP-NEXT: sw a0, 0(s0) ; RV32I-WITHFP-NEXT: sw a7, 28(s0) ; RV32I-WITHFP-NEXT: sw a6, 24(s0) ; RV32I-WITHFP-NEXT: sw a5, 20(s0) @@ -805,9 +803,9 @@ define i32 @va6_no_fixed_args(...) 
nounwind { ; RV32I-WITHFP-NEXT: sw a3, 12(s0) ; RV32I-WITHFP-NEXT: sw a2, 8(s0) ; RV32I-WITHFP-NEXT: sw a1, 4(s0) -; RV32I-WITHFP-NEXT: addi a0, s0, 4 -; RV32I-WITHFP-NEXT: sw a0, -12(s0) -; RV32I-WITHFP-NEXT: lw a0, 0(s0) +; RV32I-WITHFP-NEXT: addi a1, s0, 4 +; RV32I-WITHFP-NEXT: sw a1, -12(s0) +; RV32I-WITHFP-NEXT: sw a0, 0(s0) ; RV32I-WITHFP-NEXT: lw s0, 8(sp) ; RV32I-WITHFP-NEXT: lw ra, 12(sp) ; RV32I-WITHFP-NEXT: addi sp, sp, 48 -- GitLab From b4f227a2a58661618237bc78de4b4c7deb3b48c5 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 11 Oct 2018 11:28:27 +0000 Subject: [PATCH 0053/1116] [InstCombine] Demand bits of UMin This is the umin alternative to the umax code from rL344237. We use DeMorgans law on the umax case to bring us to the same thing on umin, but using countLeadingOnes, not countLeadingZeros. Differential Revision: https://reviews.llvm.org/D53036 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344239 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineSimplifyDemanded.cpp | 10 ++++++++++ test/Transforms/InstCombine/minmax-demandbits.ll | 12 +++--------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 18a2b2fdbfe..45cacc73d63 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -325,6 +325,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, unsigned CTZ = DemandedMask.countTrailingZeros(); if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits()) return LHS; + } else if (SPF == SPF_UMIN) { + // UMin(A, C) == A if ... + // The lowest non-zero bit of DemandMask is higher than the highest + // non-one bit of C. + // This comes from using DeMorgans on the above umax example. 
+ const APInt *C; + unsigned CTZ = DemandedMask.countTrailingZeros(); + if (match(RHS, m_APInt(C)) && + CTZ >= C->getBitWidth() - C->countLeadingOnes()) + return LHS; } // If this is a select as part of any other min/max pattern, don't simplify diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll index f838560f965..29a569663d2 100644 --- a/test/Transforms/InstCombine/minmax-demandbits.ll +++ b/test/Transforms/InstCombine/minmax-demandbits.ll @@ -190,9 +190,7 @@ define i32 @or_umin(i32 %A) { define i8 @or_min_31_30(i8 %A) { ; CHECK-LABEL: @or_min_31_30( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[A:%.*]], -30 -; CHECK-NEXT: [[MIN:%.*]] = select i1 [[CMP]], i8 [[A]], i8 -30 -; CHECK-NEXT: [[R:%.*]] = or i8 [[MIN]], 31 +; CHECK-NEXT: [[R:%.*]] = or i8 [[A:%.*]], 31 ; CHECK-NEXT: ret i8 [[R]] ; %cmp = icmp ult i8 %A, -30 @@ -203,9 +201,7 @@ define i8 @or_min_31_30(i8 %A) { define i8 @and_min_7_7(i8 %A) { ; CHECK-LABEL: @and_min_7_7( -; CHECK-NEXT: [[L2:%.*]] = icmp ult i8 [[A:%.*]], -7 -; CHECK-NEXT: [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -7 -; CHECK-NEXT: [[R:%.*]] = and i8 [[MIN]], -8 +; CHECK-NEXT: [[R:%.*]] = and i8 [[A:%.*]], -8 ; CHECK-NEXT: ret i8 [[R]] ; %l2 = icmp ult i8 %A, -7 @@ -216,9 +212,7 @@ define i8 @and_min_7_7(i8 %A) { define i8 @and_min_7_8(i8 %A) { ; CHECK-LABEL: @and_min_7_8( -; CHECK-NEXT: [[L2:%.*]] = icmp ult i8 [[A:%.*]], -8 -; CHECK-NEXT: [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -8 -; CHECK-NEXT: [[R:%.*]] = and i8 [[MIN]], -8 +; CHECK-NEXT: [[R:%.*]] = and i8 [[A:%.*]], -8 ; CHECK-NEXT: ret i8 [[R]] ; %l2 = icmp ult i8 %A, -8 -- GitLab From 14c745c241fbf61c6794fc7249e1384159df397e Mon Sep 17 00:00:00 2001 From: Dylan McKay Date: Thu, 11 Oct 2018 12:49:50 +0000 Subject: [PATCH 0054/1116] Generalize an IR verifier check to work with non-zero program address spaces This commit modifies an existing IR verifier check that assumes all functions will be located in the default 
address space 0. Rather than using the default parameter value getPointerTo(AddrSpace=0), explicitly specify the program memory address space from the data layout. This only affects targets that specify a nonzero address space in their data layouts. The only in-tree target that does this is AVR. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344243 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/Verifier.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 4b954c710e3..8304ec6e8f4 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -632,7 +632,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { if (ArrayType *ATy = dyn_cast(GV.getValueType())) { StructType *STy = dyn_cast(ATy->getElementType()); PointerType *FuncPtrTy = - FunctionType::get(Type::getVoidTy(Context), false)->getPointerTo(); + FunctionType::get(Type::getVoidTy(Context), false)-> + getPointerTo(DL.getProgramAddressSpace()); // FIXME: Reject the 2-field form in LLVM 4.0. Assert(STy && (STy->getNumElements() == 2 || STy->getNumElements() == 3) && -- GitLab From 1f4ef788a3bc636c5726f2f8ab95f11bdc0a9d58 Mon Sep 17 00:00:00 2001 From: "Diogo N. Sampaio" Date: Thu, 11 Oct 2018 14:10:32 +0000 Subject: [PATCH 0055/1116] [AARCH64][FIX] Emit data symbol for constant pool data The ARM64 elf emitter would omit printing data symbol for zero filled constant data. This patch overrides the emitFill method so as to ensure that the symbol is correctly printed. 
Differential revision: https://reviews.llvm.org/D53132 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344248 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 5 +++++ test/MC/AArch64/CheckDataSymbol.s | 15 +++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 test/MC/AArch64/CheckDataSymbol.s diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index c0ef8b67028..a09ac6b94c1 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -154,6 +154,11 @@ public: MCELFStreamer::EmitValueImpl(Value, Size, Loc); } + void emitFill(const MCExpr &NumBytes, uint64_t FillValue, + SMLoc Loc) override { + EmitDataMappingSymbol(); + MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); + } private: enum ElfMappingSymbol { EMS_None, diff --git a/test/MC/AArch64/CheckDataSymbol.s b/test/MC/AArch64/CheckDataSymbol.s new file mode 100644 index 00000000000..ea3ed7b2873 --- /dev/null +++ b/test/MC/AArch64/CheckDataSymbol.s @@ -0,0 +1,15 @@ +# RUN: llvm-mc -filetype=obj -assemble \ +# RUN: -triple=aarch64- %s -o - \ +# RUN: | llvm-readobj -s -t - | FileCheck %s +# CHECK: Name: $d.1 ({{[1-9][0-9]+}}) +# CHECK-NEXT: Value: 0x4 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Local (0x0) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text (0x2) +# CHECK-NEXT: } + +.text +nop +.zero 4 -- GitLab From 4b25d67d1bc4a69db42057bf363dd8d4a9315d9f Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Thu, 11 Oct 2018 14:51:11 +0000 Subject: [PATCH 0056/1116] [InstCombine] Fix SimplifyLibCalls erasing an instruction while IC still had references to it. InstCombine keeps a worklist and assumes that optimizations don't eraseFromParent() the instruction, which SimplifyLibCalls violates. 
This change adds a new callback to SimplifyLibCalls to let clients specify their own handler for erasing actions. Differential Revision: https://reviews.llvm.org/D52729 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344251 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Transforms/Utils/SimplifyLibCalls.h | 23 +++++++++++++---- .../InstCombine/InstCombineCalls.cpp | 6 ++++- lib/Transforms/Utils/SimplifyLibCalls.cpp | 24 ++++++++++-------- .../InstCombine/simplify-libcalls-erased.ll | 25 +++++++++++++++++++ 4 files changed, 62 insertions(+), 16 deletions(-) create mode 100644 test/Transforms/InstCombine/simplify-libcalls-erased.ll diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 2b344f44107..025bcd44e31 100644 --- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -77,21 +77,34 @@ private: OptimizationRemarkEmitter &ORE; bool UnsafeFPShrink; function_ref Replacer; + function_ref Eraser; /// Internal wrapper for RAUW that is the default implementation. /// /// Other users may provide an alternate function with this signature instead /// of this one. - static void replaceAllUsesWithDefault(Instruction *I, Value *With); + static void replaceAllUsesWithDefault(Instruction *I, Value *With) { + I->replaceAllUsesWith(With); + } + + /// Internal wrapper for eraseFromParent that is the default implementation. + static void eraseFromParentDefault(Instruction *I) { I->eraseFromParent(); } /// Replace an instruction's uses with a value using our replacer. void replaceAllUsesWith(Instruction *I, Value *With); + /// Erase an instruction from its parent with our eraser. 
+ void eraseFromParent(Instruction *I); + + Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B); + public: - LibCallSimplifier(const DataLayout &DL, const TargetLibraryInfo *TLI, - OptimizationRemarkEmitter &ORE, - function_ref Replacer = - &replaceAllUsesWithDefault); + LibCallSimplifier( + const DataLayout &DL, const TargetLibraryInfo *TLI, + OptimizationRemarkEmitter &ORE, + function_ref Replacer = + &replaceAllUsesWithDefault, + function_ref Eraser = &eraseFromParentDefault); /// optimizeCall - Take the given call instruction and return a more /// optimal value to replace the instruction with or 0 if a more diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 4e404933a22..714c6176884 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3960,7 +3960,11 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) { auto InstCombineRAUW = [this](Instruction *From, Value *With) { replaceInstUsesWith(*From, With); }; - LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW); + auto InstCombineErase = [this](Instruction *I) { + eraseInstFromFunction(*I); + }; + LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW, + InstCombineErase); if (Value *With = Simplifier.optimizeCall(CI)) { ++NumSimplified; return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 3789181a898..41a495a0484 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -923,8 +923,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { } /// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n). 
-static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B, - const TargetLibraryInfo &TLI) { +Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) { // This has to be a memset of zeros (bzero). auto *FillValue = dyn_cast(Memset->getArgOperand(1)); if (!FillValue || FillValue->getZExtValue() != 0) @@ -944,7 +943,7 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B, return nullptr; LibFunc Func; - if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) || + if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) || Func != LibFunc_malloc) return nullptr; @@ -959,18 +958,18 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B, IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext()); Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), Malloc->getArgOperand(0), Malloc->getAttributes(), - B, TLI); + B, *TLI); if (!Calloc) return nullptr; Malloc->replaceAllUsesWith(Calloc); - Malloc->eraseFromParent(); + eraseFromParent(Malloc); return Calloc; } Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { - if (auto *Calloc = foldMallocMemset(CI, B, *TLI)) + if (auto *Calloc = foldMallocMemset(CI, B)) return Calloc; // memset(p, v, n) -> llvm.memset(align 1 p, v, n) @@ -1246,7 +1245,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // effects (e.g., errno). When the only consumer for the original // exp{,2}() is pow(), then it has to be explicitly erased. BaseFn->replaceAllUsesWith(ExpFn); - BaseFn->eraseFromParent(); + eraseFromParent(BaseFn); return ExpFn; } @@ -2591,7 +2590,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. 
SimplifiedCI->replaceAllUsesWith(V); - SimplifiedCI->eraseFromParent(); + eraseFromParent(SimplifiedCI); return V; } } @@ -2670,15 +2669,20 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { LibCallSimplifier::LibCallSimplifier( const DataLayout &DL, const TargetLibraryInfo *TLI, OptimizationRemarkEmitter &ORE, - function_ref Replacer) + function_ref Replacer, + function_ref Eraser) : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), - UnsafeFPShrink(false), Replacer(Replacer) {} + UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {} void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) { // Indirect through the replacer used in this instance. Replacer(I, With); } +void LibCallSimplifier::eraseFromParent(Instruction *I) { + Eraser(I); +} + // TODO: // Additional cases that we need to add to this file: // diff --git a/test/Transforms/InstCombine/simplify-libcalls-erased.ll b/test/Transforms/InstCombine/simplify-libcalls-erased.ll new file mode 100644 index 00000000000..19cfcf8eba9 --- /dev/null +++ b/test/Transforms/InstCombine/simplify-libcalls-erased.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -instcombine | FileCheck %s + +target triple = "x86_64" + +define double @pow_exp(double %x, double %y) { +; CHECK-LABEL: @pow_exp( +; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[EXP:%.*]] = call fast double @llvm.exp.f64(double [[MUL]]) +; CHECK-NEXT: ret double [[EXP]] +; + %A = alloca i1 + %call = call fast double @exp(double %x) #1 + %pow = call fast double @llvm.pow.f64(double %call, double %y) + %C1 = fcmp ule double %call, %pow + store i1 %C1, i1* %A + ret double %pow +} + +declare double @exp(double) + +declare double @llvm.pow.f64(double, double) #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind readnone } -- GitLab From 09ab8e9f3ad978820f216f13888e93366478ce07 Mon Sep 17 00:00:00 2001 
From: Andrea Di Biagio Date: Thu, 11 Oct 2018 14:54:54 +0000 Subject: [PATCH 0057/1116] [llvm-mca][BtVer2] Add tests for optimizable GPR register moves. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344253 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../X86/BtVer2/reg-move-elimination-4.s | 108 ++++++++++++++++++ .../X86/BtVer2/reg-move-elimination-5.s | 108 ++++++++++++++++++ 2 files changed, 216 insertions(+) create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s create mode 100644 test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s new file mode 100644 index 00000000000..72ca7693c5f --- /dev/null +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s @@ -0,0 +1,108 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s + +xor %eax, %eax +mov %eax, %ebx +mov %ebx, %ecx +mov %ecx, %edx +mov %edx, %eax + +# CHECK: Iterations: 3 +# CHECK-NEXT: Instructions: 15 +# CHECK-NEXT: Total Cycles: 12 +# CHECK-NEXT: Total uOps: 15 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.25 +# CHECK-NEXT: IPC: 1.25 +# CHECK-NEXT: Block RThroughput: 2.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 0 0.50 xorl %eax, %eax +# CHECK-NEXT: 1 1 0.50 movl %eax, %ebx +# CHECK-NEXT: 1 1 0.50 movl %ebx, %ecx +# CHECK-NEXT: 1 1 0.50 movl %ecx, %edx +# CHECK-NEXT: 1 1 0.50 movl %edx, %eax + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 7 + +# CHECK: * 
Register File #1 -- JFpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: * Register File #2 -- JIntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 7 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - - - - - - - - xorl %eax, %eax +# CHECK-NEXT: 0.33 0.67 - - - - - - - - - - - - movl %eax, %ebx +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - movl %ebx, %ecx +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - movl %ecx, %edx +# CHECK-NEXT: 0.67 0.33 - - - - - - - - - - - - movl %edx, %eax + +# CHECK: Timeline view: +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DR . .. xorl %eax, %eax +# CHECK-NEXT: [0,1] DeER . .. movl %eax, %ebx +# CHECK-NEXT: [0,2] .DeER. .. movl %ebx, %ecx +# CHECK-NEXT: [0,3] .D=eER .. movl %ecx, %edx +# CHECK-NEXT: [0,4] . D=eER .. movl %edx, %eax +# CHECK-NEXT: [1,0] . D---R .. xorl %eax, %eax +# CHECK-NEXT: [1,1] . DeE-R .. movl %eax, %ebx +# CHECK-NEXT: [1,2] . D=eER .. movl %ebx, %ecx +# CHECK-NEXT: [1,3] . D=eER .. movl %ecx, %edx +# CHECK-NEXT: [1,4] . D==eER.. movl %edx, %eax +# CHECK-NEXT: [2,0] . D---R.. 
xorl %eax, %eax +# CHECK-NEXT: [2,1] . DeE--R. movl %eax, %ebx +# CHECK-NEXT: [2,2] . .DeE-R. movl %ebx, %ecx +# CHECK-NEXT: [2,3] . .D=eE-R movl %ecx, %edx +# CHECK-NEXT: [2,4] . . D=eER movl %edx, %eax + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 0.0 0.0 2.0 xorl %eax, %eax +# CHECK-NEXT: 1. 3 1.0 1.0 1.0 movl %eax, %ebx +# CHECK-NEXT: 2. 3 1.3 0.0 0.3 movl %ebx, %ecx +# CHECK-NEXT: 3. 3 2.0 0.0 0.3 movl %ecx, %edx +# CHECK-NEXT: 4. 3 2.3 0.0 0.0 movl %edx, %eax diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s new file mode 100644 index 00000000000..7d6b75f7c3f --- /dev/null +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s @@ -0,0 +1,108 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s + +xor %rax, %rax +mov %rax, %rbx +mov %rbx, %rcx +mov %rcx, %rdx +mov %rdx, %rax + +# CHECK: Iterations: 3 +# CHECK-NEXT: Instructions: 15 +# CHECK-NEXT: Total Cycles: 12 +# CHECK-NEXT: Total uOps: 15 + +# CHECK: Dispatch Width: 2 +# CHECK-NEXT: uOps Per Cycle: 1.25 +# CHECK-NEXT: IPC: 1.25 +# CHECK-NEXT: Block RThroughput: 2.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 0 0.50 xorq %rax, %rax +# CHECK-NEXT: 1 1 0.50 movq %rax, %rbx +# CHECK-NEXT: 1 1 0.50 movq %rbx, %rcx +# CHECK-NEXT: 1 1 0.50 
movq %rcx, %rdx +# CHECK-NEXT: 1 1 0.50 movq %rdx, %rax + +# CHECK: Register File statistics: +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 7 + +# CHECK: * Register File #1 -- JFpuPRF: +# CHECK-NEXT: Number of physical registers: 72 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 + +# CHECK: * Register File #2 -- JIntegerPRF: +# CHECK-NEXT: Number of physical registers: 64 +# CHECK-NEXT: Total number of mappings created: 12 +# CHECK-NEXT: Max number of mappings used: 7 + +# CHECK: Resources: +# CHECK-NEXT: [0] - JALU0 +# CHECK-NEXT: [1] - JALU1 +# CHECK-NEXT: [2] - JDiv +# CHECK-NEXT: [3] - JFPA +# CHECK-NEXT: [4] - JFPM +# CHECK-NEXT: [5] - JFPU0 +# CHECK-NEXT: [6] - JFPU1 +# CHECK-NEXT: [7] - JLAGU +# CHECK-NEXT: [8] - JMul +# CHECK-NEXT: [9] - JSAGU +# CHECK-NEXT: [10] - JSTC +# CHECK-NEXT: [11] - JVALU0 +# CHECK-NEXT: [12] - JVALU1 +# CHECK-NEXT: [13] - JVIMUL + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] +# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: +# CHECK-NEXT: - - - - - - - - - - - - - - xorq %rax, %rax +# CHECK-NEXT: 0.33 0.67 - - - - - - - - - - - - movq %rax, %rbx +# CHECK-NEXT: 1.00 - - - - - - - - - - - - - movq %rbx, %rcx +# CHECK-NEXT: - 1.00 - - - - - - - - - - - - movq %rcx, %rdx +# CHECK-NEXT: 0.67 0.33 - - - - - - - - - - - - movq %rdx, %rax + +# CHECK: Timeline view: +# CHECK-NEXT: 01 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DR . .. xorq %rax, %rax +# CHECK-NEXT: [0,1] DeER . .. movq %rax, %rbx +# CHECK-NEXT: [0,2] .DeER. .. movq %rbx, %rcx +# CHECK-NEXT: [0,3] .D=eER .. movq %rcx, %rdx +# CHECK-NEXT: [0,4] . D=eER .. movq %rdx, %rax +# CHECK-NEXT: [1,0] . D---R .. xorq %rax, %rax +# CHECK-NEXT: [1,1] . DeE-R .. 
movq %rax, %rbx +# CHECK-NEXT: [1,2] . D=eER .. movq %rbx, %rcx +# CHECK-NEXT: [1,3] . D=eER .. movq %rcx, %rdx +# CHECK-NEXT: [1,4] . D==eER.. movq %rdx, %rax +# CHECK-NEXT: [2,0] . D---R.. xorq %rax, %rax +# CHECK-NEXT: [2,1] . DeE--R. movq %rax, %rbx +# CHECK-NEXT: [2,2] . .DeE-R. movq %rbx, %rcx +# CHECK-NEXT: [2,3] . .D=eE-R movq %rcx, %rdx +# CHECK-NEXT: [2,4] . . D=eER movq %rdx, %rax + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 3 0.0 0.0 2.0 xorq %rax, %rax +# CHECK-NEXT: 1. 3 1.0 1.0 1.0 movq %rax, %rbx +# CHECK-NEXT: 2. 3 1.3 0.0 0.3 movq %rbx, %rcx +# CHECK-NEXT: 3. 3 2.0 0.0 0.3 movq %rcx, %rdx +# CHECK-NEXT: 4. 3 2.3 0.0 0.0 movq %rdx, %rax -- GitLab From 7b42e952770909c390a606159577782927263286 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 11 Oct 2018 16:07:25 +0000 Subject: [PATCH 0058/1116] [DAGCombiner] move comment closer to the corresponding code; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344255 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index eca5d8369eb..4a80c1d358d 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15451,14 +15451,13 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( } SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { - // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); - if (InVec.isUndef()) return DAG.getUNDEF(NVT); + // (vextract 
(scalar_to_vector val, 0) -> val if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the -- GitLab From ea0a193dbf8790b8e3a3a5e24a1da61e6d154ef9 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Thu, 11 Oct 2018 17:55:11 +0000 Subject: [PATCH 0059/1116] [llvm-nm] Fix crash when running with --print-armap on corrupt archives. error() in llvm-nm intentionally returns instead of exiting so that the caller can move on to future files/slices. When printing the archive map, this is not currently handled (the caller assumes that error() does not return), so processing continues despite there being an error. Also, change one return to a break, so that symbols can be printed even if the archive map is corrupt. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344268 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-nm/llvm-nm.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp index 22fdd4ca85e..7e257d8ce89 100644 --- a/tools/llvm-nm/llvm-nm.cpp +++ b/tools/llvm-nm/llvm-nm.cpp @@ -1755,12 +1755,14 @@ static void dumpSymbolNamesFromFile(std::string &Filename) { outs() << "Archive map\n"; for (; I != E; ++I) { Expected C = I->getMember(); - if (!C) + if (!C) { error(C.takeError(), Filename); + break; + } Expected FileNameOrErr = C->getName(); if (!FileNameOrErr) { error(FileNameOrErr.takeError(), Filename); - return; + break; } StringRef SymName = I->getName(); outs() << SymName << " in " << FileNameOrErr.get() << "\n"; -- GitLab From 488a8b20ffdf168bbfbc987aef57baa2de903630 Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Thu, 11 Oct 2018 18:01:55 +0000 Subject: [PATCH 0060/1116] Better support for POSIX paths in PDBs. While it doesn't make a *ton* of sense for POSIX paths to be in PDBs, it's possible to occur in real scenarios involving cross compilation. 
The tools need to be able to handle this, because certain types of debugging scenarios are possible without a running process and so don't necessarily require you to be on a Windows system. These include post-mortem debugging and binary forensics (e.g. using a debugger to disassemble functions and examine symbols without running the process). There's changes in clang, LLD, and lldb in this patch. After this the cross-platform disassembly and source-list tests pass on Linux. Furthermore, the behavior of LLD can now be summarized by a much simpler rule than before: Unless you specify /pdbsourcepath and /pdbaltpath, the PDB ends up with paths that are valid within the context of the machine that the link is performed on. Differential Revision: https://reviews.llvm.org/D53149 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344269 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 4d45a103c5a..8232f076a93 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -73,6 +73,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Path.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -134,7 +135,9 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) { // If this is a Unix-style path, just use it as is. Don't try to canonicalize // it textually because one of the path components could be a symlink. 
- if (!Dir.empty() && Dir[0] == '/') { + if (Dir.startswith("/") || Filename.startswith("/")) { + if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix)) + return Filename; Filepath = Dir; if (Dir.back() != '/') Filepath += '/'; -- GitLab From a6f9ade27a7adb8ea7b8585f79dd3bd237e5501d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 11 Oct 2018 18:06:07 +0000 Subject: [PATCH 0061/1116] [X86] Restore X86ISelDAGToDAG::matchBEXTRFromAnd. Teach address matching to create a BEXTR pattern from a (shl (and X, mask >> C1) if C1 can be folded into addressing mode. This is an alternative to D53080 since I think using a BEXTR for a shifted mask is definitely an improvement when the shl can be absorbed into addressing mode. The other cases I'm less sure about. We already have several tricks for handling an and of a shift in address matching. This adds a new case for BEXTR. I've moved the BEXTR matching code back to X86ISelDAGToDAG to allow it to match. I suppose alternatively we could directly emit a X86ISD::BEXTR node that isel could pattern match. But I'm trying to view BEXTR matching as an isel concern so DAG combine can see 'and' and 'shift' operations that are well understood. We did lose a couple cases from tbm_patterns.ll, but I think there are ways to recover that. I've also put back the manual load folding code in matchBEXTRFromAnd that I removed a few months ago in r324939. This gives us some more freedom to make decisions based on the ability to fold a load. I haven't done anything with that yet. 
Differential Revision: https://reviews.llvm.org/D53126 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344270 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 152 +++++++++++++++++++++++++++++ lib/Target/X86/X86ISelLowering.cpp | 66 ------------- lib/Target/X86/X86InstrCompiler.td | 14 --- test/CodeGen/X86/extract-bits.ll | 78 ++++++++++++--- test/CodeGen/X86/tbm_patterns.ll | 6 +- 5 files changed, 218 insertions(+), 98 deletions(-) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index c043c7c54cc..f8ec4a2bcfc 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -451,6 +451,7 @@ namespace { } bool foldLoadStoreIntoMemOperand(SDNode *Node); + bool matchBEXTRFromAndImm(SDNode *Node); bool matchBEXTR(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; @@ -1340,6 +1341,64 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return false; } +// Transform "(X >> SHIFT) & (MASK << C1)" to +// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be +// matched to a BEXTR later. Returns false if the simplification is performed. +static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, + uint64_t Mask, + SDValue Shift, SDValue X, + X86ISelAddressMode &AM, + const X86Subtarget &Subtarget) { + if (Shift.getOpcode() != ISD::SRL || + !isa(Shift.getOperand(1)) || + !Shift.hasOneUse() || !N.hasOneUse()) + return true; + + // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. + if (!Subtarget.hasTBM() && + !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) + return true; + + // We need to ensure that mask is a continuous run of bits. + if (!isShiftedMask_64(Mask)) return true; + + unsigned ShiftAmt = Shift.getConstantOperandVal(1); + + // The amount of shift we're trying to fit into the addressing mode is taken + // from the trailing zeros of the mask. 
+ unsigned AMShiftAmt = countTrailingZeros(Mask); + + // There is nothing we can do here unless the mask is removing some bits. + // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. + if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + + MVT VT = N.getSimpleValueType(); + SDLoc DL(N); + SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); + SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); + SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); + SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); + SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); + SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); + + // Insert the new nodes into the topological ordering. We must do this in + // a valid topological ordering as nothing is going to go back and re-sort + // these nodes. We continually insert before 'N' in sequence as this is + // essentially a pre-flattened and pre-sorted sequence of nodes. There is no + // hierarchy left to express. + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); + DAG.ReplaceAllUsesWith(N, NewSHL); + + AM.Scale = 1 << AMShiftAmt; + AM.IndexReg = NewAnd; + return false; +} + bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); @@ -1620,6 +1679,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // a scale on the outside of the mask. if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) return false; + + // Try to fold the mask and shift into BEXTR and scale. 
+ if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) + return false; + break; } } @@ -2646,6 +2710,92 @@ bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) { return true; } +// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. +bool X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { + MVT NVT = Node->getSimpleValueType(0); + SDLoc dl(Node); + + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + // If we have TBM we can use an immediate for the control. If we have BMI + // we should only do this if the BEXTR instruction is implemented well. + // Otherwise moving the control into a register makes this more costly. + // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM + // hoisting the move immediate would make it worthwhile with a less optimal + // BEXTR? + if (!Subtarget->hasTBM() && + !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) + return false; + + // Must have a shift right. + if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) + return false; + + // Shift can't have additional users. + if (!N0->hasOneUse()) + return false; + + // Only supported for 32 and 64 bits. + if (NVT != MVT::i32 && NVT != MVT::i64) + return false; + + // Shift amount and RHS of and must be constant. + ConstantSDNode *MaskCst = dyn_cast(N1); + ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); + if (!MaskCst || !ShiftCst) + return false; + + // And RHS must be a mask. + uint64_t Mask = MaskCst->getZExtValue(); + if (!isMask_64(Mask)) + return false; + + uint64_t Shift = ShiftCst->getZExtValue(); + uint64_t MaskSize = countPopulation(Mask); + + // Don't interfere with something that can be handled by extracting AH. + // TODO: If we are able to fold a load, BEXTR might still be better than AH. + if (Shift == 8 && MaskSize == 8) + return false; + + // Make sure we are only using bits that were in the original value, not + // shifted in. 
+ if (Shift + MaskSize > NVT.getSizeInBits()) + return false; + + SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); + unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; + unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + + // BMI requires the immediate to placed in a register. + if (!Subtarget->hasTBM()) { + ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; + MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; + unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; + New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); + } + + MachineSDNode *NewNode; + SDValue Input = N0->getOperand(0); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + // Update the chain. + ReplaceUses(Input.getValue(1), SDValue(NewNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(NewNode, {cast(Input)->getMemOperand()}); + } else { + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New); + } + + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); + CurDAG->RemoveDeadNode(Node); + return true; +} + // Emit a PCMISTR(I/M) instruction. 
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, @@ -2953,6 +3103,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case ISD::AND: + if (matchBEXTRFromAndImm(Node)) + return; if (matchBEXTR(Node)) return; if (AndImmShrink && shrinkAndImmediate(Node)) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ab9a14a65a1..67f98d8ee72 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35278,69 +35278,6 @@ static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, return SDValue(); } -static bool hasBEXTR(const X86Subtarget &Subtarget, EVT VT) { - // If we have TBM we can use an immediate for the control. If we have BMI - // we should only do this if the BEXTR instruction is implemented well. - // Otherwise moving the control into a register makes this more costly. - // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM - // hoisting the move immediate would make it worthwhile with a less optimal - // BEXTR? - if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) - return false; - return (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit())); -} - -// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. -static SDValue combineAndIntoBEXTR(SDNode *Node, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - EVT NVT = Node->getValueType(0); - SDLoc dl(Node); - - SDValue N0 = Node->getOperand(0); - SDValue N1 = Node->getOperand(1); - - // Check if subtarget has BEXTR instruction for the node's type - if (!hasBEXTR(Subtarget, NVT)) - return SDValue(); - - // Must have a shift right. - if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) - return SDValue(); - - // Shift can't have additional users. - if (!N0->hasOneUse()) - return SDValue(); - - // Shift amount and RHS of and must be constant. 
- ConstantSDNode *MaskCst = dyn_cast(N1); - ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); - if (!MaskCst || !ShiftCst) - return SDValue(); - - // And RHS must be a mask. - uint64_t Mask = MaskCst->getZExtValue(); - if (!isMask_64(Mask)) - return SDValue(); - - uint64_t Shift = ShiftCst->getZExtValue(); - uint64_t MaskSize = countPopulation(Mask); - - // Don't interfere with something that can be handled by extracting AH. - // TODO: If we are able to fold a load, BEXTR might still be better than AH. - if (Shift == 8 && MaskSize == 8) - return SDValue(); - - // Make sure we are only using bits that were in the original value, not - // shifted in. - if (Shift + MaskSize > NVT.getSizeInBits()) - return SDValue(); - - // Create a BEXTR node. - SDValue C = DAG.getConstant(Shift | (MaskSize << 8), dl, NVT); - SDValue New = DAG.getNode(X86ISD::BEXTR, dl, NVT, N0->getOperand(0), C); - return New; -} - // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity. // Turn it into series of XORs and a setnp. static SDValue combineParity(SDNode *N, SelectionDAG &DAG, @@ -35442,9 +35379,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (SDValue R = combineAndIntoBEXTR(N, DAG, Subtarget)) - return R; - if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index de45b4697ac..051832bf4bc 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -2135,17 +2135,3 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; let Predicates = [HasMOVBE] in { def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>; } - -// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that -// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch -// of manual code for folding loads. 
-let Predicates = [HasBMI, NoTBM] in { - def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), - (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>; - def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)), - (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>; - def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2), - (BEXTR64rr GR64:$src1, (MOV32ri64 mov64imm32:$src2))>; - def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2), - (BEXTR64rm addr:$src1, (MOV32ri64 mov64imm32:$src2))>; -} // HasBMI, NoTBM diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll index b16aeb3d350..06f316b14d0 100644 --- a/test/CodeGen/X86/extract-bits.ll +++ b/test/CodeGen/X86/extract-bits.ll @@ -5568,23 +5568,69 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits ; https://bugs.llvm.org/show_bug.cgi?id=38938 define void @pr38938(i32* %a0, i64* %a1) { -; X86-LABEL: pr38938: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: shrl $19, %ecx -; X86-NEXT: andl $4092, %ecx # imm = 0xFFC -; X86-NEXT: incl (%eax,%ecx) -; X86-NEXT: retl +; X86-NOBMI-LABEL: pr38938: +; X86-NOBMI: # %bb.0: +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOBMI-NEXT: movl (%ecx), %ecx +; X86-NOBMI-NEXT: shrl $19, %ecx +; X86-NOBMI-NEXT: andl $4092, %ecx # imm = 0xFFC +; X86-NOBMI-NEXT: incl (%eax,%ecx) +; X86-NOBMI-NEXT: retl ; -; X64-LABEL: pr38938: -; X64: # %bb.0: -; X64-NEXT: movq (%rsi), %rax -; X64-NEXT: shrq $19, %rax -; X64-NEXT: andl $4092, %eax # imm = 0xFFC -; X64-NEXT: incl (%rdi,%rax) -; X64-NEXT: retq +; X86-BMI1NOTBM-LABEL: pr38938: +; X86-BMI1NOTBM: # %bb.0: +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBM-NEXT: movl $2581, %edx # imm = 0xA15 +; X86-BMI1NOTBM-NEXT: bextrl %edx, (%ecx), %ecx +; X86-BMI1NOTBM-NEXT: incl (%eax,%ecx,4) +; 
X86-BMI1NOTBM-NEXT: retl +; +; X86-BMI1TBM-LABEL: pr38938: +; X86-BMI1TBM: # %bb.0: +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1TBM-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1TBM-NEXT: bextrl $2581, (%ecx), %ecx # imm = 0xA15 +; X86-BMI1TBM-NEXT: incl (%eax,%ecx,4) +; X86-BMI1TBM-NEXT: retl +; +; X86-BMI1NOTBMBMI2-LABEL: pr38938: +; X86-BMI1NOTBMBMI2: # %bb.0: +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-BMI1NOTBMBMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-BMI1NOTBMBMI2-NEXT: movl $2581, %edx # imm = 0xA15 +; X86-BMI1NOTBMBMI2-NEXT: bextrl %edx, (%ecx), %ecx +; X86-BMI1NOTBMBMI2-NEXT: incl (%eax,%ecx,4) +; X86-BMI1NOTBMBMI2-NEXT: retl +; +; X64-NOBMI-LABEL: pr38938: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: movq (%rsi), %rax +; X64-NOBMI-NEXT: shrq $19, %rax +; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC +; X64-NOBMI-NEXT: incl (%rdi,%rax) +; X64-NOBMI-NEXT: retq +; +; X64-BMI1NOTBM-LABEL: pr38938: +; X64-BMI1NOTBM: # %bb.0: +; X64-BMI1NOTBM-NEXT: movl $2581, %eax # imm = 0xA15 +; X64-BMI1NOTBM-NEXT: bextrq %rax, (%rsi), %rax +; X64-BMI1NOTBM-NEXT: incl (%rdi,%rax,4) +; X64-BMI1NOTBM-NEXT: retq +; +; X64-BMI1TBM-LABEL: pr38938: +; X64-BMI1TBM: # %bb.0: +; X64-BMI1TBM-NEXT: bextrq $2581, (%rsi), %rax # imm = 0xA15 +; X64-BMI1TBM-NEXT: incl (%rdi,%rax,4) +; X64-BMI1TBM-NEXT: retq +; +; X64-BMI1NOTBMBMI2-LABEL: pr38938: +; X64-BMI1NOTBMBMI2: # %bb.0: +; X64-BMI1NOTBMBMI2-NEXT: movl $2581, %eax # imm = 0xA15 +; X64-BMI1NOTBMBMI2-NEXT: bextrq %rax, (%rsi), %rax +; X64-BMI1NOTBMBMI2-NEXT: incl (%rdi,%rax,4) +; X64-BMI1NOTBMBMI2-NEXT: retq %tmp = load i64, i64* %a1, align 8 %tmp1 = lshr i64 %tmp, 21 %tmp2 = and i64 %tmp1, 1023 diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll index 2b335ea4268..6865cc5a0ef 100644 --- a/test/CodeGen/X86/tbm_patterns.ll +++ b/test/CodeGen/X86/tbm_patterns.ll @@ -53,7 +53,8 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind { ; CHECK-LABEL: 
test_x86_tbm_bextri_u32_z2: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; CHECK-NEXT: shrl $4, %edi +; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF ; CHECK-NEXT: cmovnel %edx, %eax ; CHECK-NEXT: retq %t0 = lshr i32 %a, 4 @@ -113,7 +114,8 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind { ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2: ; CHECK: # %bb.0: ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: bextrl $3076, %edi, %ecx # imm = 0xC04 +; CHECK-NEXT: shrl $4, %edi +; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF ; CHECK-NEXT: cmovneq %rdx, %rax ; CHECK-NEXT: retq %t0 = lshr i64 %a, 4 -- GitLab From 29b3e08ecf43747ba6723ff373ef23aace083bf8 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 11 Oct 2018 18:26:02 +0000 Subject: [PATCH 0062/1116] [Hexagon] Eliminate potential sources of non-determinism in HCE Also, avoid comparing GUIDs when ordering global addresses, because source file location can cause different GUID to be calculated. As a result, a pair of symbols can compare "less" in one directory, but "greater" in another. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344271 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonConstExtenders.cpp | 42 +++++++++++++++----- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp index 6b48384c737..d096445f144 100644 --- a/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -376,7 +376,7 @@ namespace { using IndexList = SetVector; using ExtenderInit = std::pair; using AssignmentMap = std::map; - using LocDefMap = std::map; + using LocDefList = std::vector>; const HexagonInstrInfo *HII = nullptr; const HexagonRegisterInfo *HRI = nullptr; @@ -399,7 +399,7 @@ namespace { void assignInits(const ExtRoot &ER, unsigned Begin, unsigned End, AssignmentMap &IMap); void calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs, - LocDefMap &Defs); + LocDefList &Defs); Register insertInitializer(Loc DefL, const ExtenderInit &ExtI); bool replaceInstrExact(const ExtDesc &ED, Register ExtR); bool replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI, @@ -731,7 +731,12 @@ bool HCE::ExtRoot::operator< (const HCE::ExtRoot &ER) const { case MachineOperand::MO_ExternalSymbol: return StringRef(V.SymbolName) < StringRef(ER.V.SymbolName); case MachineOperand::MO_GlobalAddress: - return V.GV->getGUID() < ER.V.GV->getGUID(); + // Do not use GUIDs, since they depend on the source path. Moving the + // source file to a different directory could cause different GUID + // values for a pair of given symbols. These symbols could then compare + // "less" in one directory, but "greater" in another. 
+ assert(!V.GV->getName().empty() && !ER.V.GV->getName().empty()); + return V.GV->getName() < ER.V.GV->getName(); case MachineOperand::MO_BlockAddress: { const BasicBlock *ThisB = V.BA->getBasicBlock(); const BasicBlock *OtherB = ER.V.BA->getBasicBlock(); @@ -1236,9 +1241,13 @@ void HCE::collectInstr(MachineInstr &MI) { void HCE::collect(MachineFunction &MF) { Extenders.clear(); - for (MachineBasicBlock &MBB : MF) + for (MachineBasicBlock &MBB : MF) { + // Skip unreachable blocks. + if (MBB.getNumber() == -1) + continue; for (MachineInstr &MI : MBB) collectInstr(MI); + } } void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End, @@ -1463,7 +1472,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End, } void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs, - LocDefMap &Defs) { + LocDefList &Defs) { if (Refs.empty()) return; @@ -1510,7 +1519,7 @@ void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs, It = DomB->getFirstTerminator(); } Loc DefLoc(DomB, It); - Defs.emplace(DefLoc, Refs); + Defs.emplace_back(DefLoc, Refs); } HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) { @@ -1880,7 +1889,7 @@ bool HCE::replaceInstr(unsigned Idx, Register ExtR, const ExtenderInit &ExtI) { } bool HCE::replaceExtenders(const AssignmentMap &IMap) { - LocDefMap Defs; + LocDefList Defs; bool Changed = false; for (const std::pair &P : IMap) { @@ -1947,8 +1956,23 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) { AssignmentMap IMap; collect(MF); - llvm::sort(Extenders, [](const ExtDesc &A, const ExtDesc &B) { - return ExtValue(A) < ExtValue(B); + llvm::sort(Extenders, [this](const ExtDesc &A, const ExtDesc &B) { + ExtValue VA(A), VB(B); + if (VA != VB) + return VA < VB; + const MachineInstr *MA = A.UseMI; + const MachineInstr *MB = B.UseMI; + if (MA == MB) { + // If it's the same instruction, compare operand numbers. 
+ return A.OpNum < B.OpNum; + } + + const MachineBasicBlock *BA = MA->getParent(); + const MachineBasicBlock *BB = MB->getParent(); + assert(BA->getNumber() != -1 && BB->getNumber() != -1); + if (BA != BB) + return BA->getNumber() < BB->getNumber(); + return MDT->dominates(MA, MB); }); bool Changed = false; -- GitLab From f5647cf249aea2806e8cd6cfe06f783a54b46825 Mon Sep 17 00:00:00 2001 From: Nirav Dave Date: Thu, 11 Oct 2018 18:28:59 +0000 Subject: [PATCH 0063/1116] [DAG] Fix Big Endian in Load-Store forwarding Summary: Correct offset calculation in load-store forwarding for big-endian targets. Reviewers: rnk, RKSimon, waltl Subscribers: sdardis, nemanjai, hiraditya, jrtc27, atanasyan, jsji, llvm-commits Differential Revision: https://reviews.llvm.org/D53147 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344272 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++ test/CodeGen/Mips/cconv/vector.ll | 70 +++++++++++++------ .../PowerPC/big-endian-store-forward.ll | 16 +++++ 3 files changed, 68 insertions(+), 23 deletions(-) create mode 100644 test/CodeGen/PowerPC/big-endian-store-forward.ll diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4a80c1d358d..16834dc1a26 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12847,6 +12847,11 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) { if (!STCoversLD) return SDValue(); + // Normalize for Endianness. + if (DAG.getDataLayout().isBigEndian()) + Offset = + (STMemType.getSizeInBits() - LDMemType.getSizeInBits()) / 8 - Offset; + // Memory as copy space (potentially masked). 
if (Offset == 0 && LDType == STType && STMemType == LDMemType) { // Simple case: Direct non-truncating forwarding diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll index 8cec16683ca..29ffe23f712 100644 --- a/test/CodeGen/Mips/cconv/vector.ll +++ b/test/CodeGen/Mips/cconv/vector.ll @@ -2045,29 +2045,29 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS32R5EB-NEXT: jr $ra ; MIPS32R5EB-NEXT: nop ; -; MIPS64R5-LABEL: i32_2: -; MIPS64R5: # %bb.0: -; MIPS64R5-NEXT: daddiu $sp, $sp, -32 -; MIPS64R5-NEXT: .cfi_def_cfa_offset 32 -; MIPS64R5-NEXT: sd $5, 16($sp) -; MIPS64R5-NEXT: sd $4, 24($sp) -; MIPS64R5-NEXT: ldi.b $w0, 0 -; MIPS64R5-NEXT: lw $1, 20($sp) -; MIPS64R5-NEXT: move.v $w1, $w0 -; MIPS64R5-NEXT: insert.d $w1[0], $5 -; MIPS64R5-NEXT: insert.d $w1[1], $1 -; MIPS64R5-NEXT: insert.d $w0[0], $4 -; MIPS64R5-NEXT: lw $1, 28($sp) -; MIPS64R5-NEXT: insert.d $w0[1], $1 -; MIPS64R5-NEXT: addv.d $w0, $w0, $w1 -; MIPS64R5-NEXT: copy_s.d $1, $w0[0] -; MIPS64R5-NEXT: copy_s.d $2, $w0[1] -; MIPS64R5-NEXT: sw $2, 12($sp) -; MIPS64R5-NEXT: sw $1, 8($sp) -; MIPS64R5-NEXT: ld $2, 8($sp) -; MIPS64R5-NEXT: daddiu $sp, $sp, 32 -; MIPS64R5-NEXT: jr $ra -; MIPS64R5-NEXT: nop +; MIPS64R5EB-LABEL: i32_2: +; MIPS64R5EB: # %bb.0: +; MIPS64R5EB-NEXT: daddiu $sp, $sp, -32 +; MIPS64R5EB-NEXT: .cfi_def_cfa_offset 32 +; MIPS64R5EB-NEXT: sd $5, 16($sp) +; MIPS64R5EB-NEXT: sd $4, 24($sp) +; MIPS64R5EB-NEXT: ldi.b $w0, 0 +; MIPS64R5EB-NEXT: lw $1, 16($sp) +; MIPS64R5EB-NEXT: move.v $w1, $w0 +; MIPS64R5EB-NEXT: insert.d $w1[0], $1 +; MIPS64R5EB-NEXT: insert.d $w1[1], $5 +; MIPS64R5EB-NEXT: lw $1, 24($sp) +; MIPS64R5EB-NEXT: insert.d $w0[0], $1 +; MIPS64R5EB-NEXT: insert.d $w0[1], $4 +; MIPS64R5EB-NEXT: addv.d $w0, $w0, $w1 +; MIPS64R5EB-NEXT: copy_s.d $1, $w0[0] +; MIPS64R5EB-NEXT: copy_s.d $2, $w0[1] +; MIPS64R5EB-NEXT: sw $2, 12($sp) +; MIPS64R5EB-NEXT: sw $1, 8($sp) +; MIPS64R5EB-NEXT: ld $2, 8($sp) +; MIPS64R5EB-NEXT: daddiu $sp, $sp, 32 +; 
MIPS64R5EB-NEXT: jr $ra +; MIPS64R5EB-NEXT: nop ; ; MIPS32R5EL-LABEL: i32_2: ; MIPS32R5EL: # %bb.0: @@ -2093,6 +2093,30 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) { ; MIPS32R5EL-NEXT: addiu $sp, $sp, 48 ; MIPS32R5EL-NEXT: jr $ra ; MIPS32R5EL-NEXT: nop +; +; MIPS64R5EL-LABEL: i32_2: +; MIPS64R5EL: # %bb.0: +; MIPS64R5EL-NEXT: daddiu $sp, $sp, -32 +; MIPS64R5EL-NEXT: .cfi_def_cfa_offset 32 +; MIPS64R5EL-NEXT: sd $5, 16($sp) +; MIPS64R5EL-NEXT: sd $4, 24($sp) +; MIPS64R5EL-NEXT: ldi.b $w0, 0 +; MIPS64R5EL-NEXT: lw $1, 20($sp) +; MIPS64R5EL-NEXT: move.v $w1, $w0 +; MIPS64R5EL-NEXT: insert.d $w1[0], $5 +; MIPS64R5EL-NEXT: insert.d $w1[1], $1 +; MIPS64R5EL-NEXT: insert.d $w0[0], $4 +; MIPS64R5EL-NEXT: lw $1, 28($sp) +; MIPS64R5EL-NEXT: insert.d $w0[1], $1 +; MIPS64R5EL-NEXT: addv.d $w0, $w0, $w1 +; MIPS64R5EL-NEXT: copy_s.d $1, $w0[0] +; MIPS64R5EL-NEXT: copy_s.d $2, $w0[1] +; MIPS64R5EL-NEXT: sw $2, 12($sp) +; MIPS64R5EL-NEXT: sw $1, 8($sp) +; MIPS64R5EL-NEXT: ld $2, 8($sp) +; MIPS64R5EL-NEXT: daddiu $sp, $sp, 32 +; MIPS64R5EL-NEXT: jr $ra +; MIPS64R5EL-NEXT: nop %1 = add <2 x i32> %a, %b ret <2 x i32> %1 } diff --git a/test/CodeGen/PowerPC/big-endian-store-forward.ll b/test/CodeGen/PowerPC/big-endian-store-forward.ll new file mode 100644 index 00000000000..1125a229005 --- /dev/null +++ b/test/CodeGen/PowerPC/big-endian-store-forward.ll @@ -0,0 +1,16 @@ +; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s + +; The load is to the high byte of the 2-byte store +@g = global i8 -75 + +define void @f(i16 %v) { +; CHECK-LABEL: f +; CHECK: sth 3, -2(1) +; CHECK: lbz 3, -2(1) + %p32 = alloca i16 + store i16 %v, i16* %p32 + %p16 = bitcast i16* %p32 to i8* + %tmp = load i8, i8* %p16 + store i8 %tmp, i8* @g + ret void +} -- GitLab From 2f2ce25a6b4dc4d37c6f45866ed7310adb6233dc Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 11 Oct 2018 18:31:51 +0000 Subject: [PATCH 0064/1116] [PassManager/Sanitizer] Port of AddresSanitizer pass from legacy to new 
PassManager This patch ports the legacy pass manager to the new one to take advantage of the benefits of the new PM. This involved moving a lot of the declarations for `AddressSanitizer` to a header so that it can be publicly used via PassRegistry.def which I believe contains all the passes managed by the new PM. This patch essentially decouples the instrumentation from the legacy PM such that it can be used by both legacy and new PM infrastructure. Differential Revision: https://reviews.llvm.org/D52739 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344274 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/InitializePasses.h | 4 +- .../Instrumentation/AddressSanitizerPass.h | 41 ++++ lib/Passes/PassBuilder.cpp | 3 +- lib/Passes/PassRegistry.def | 2 + .../Instrumentation/AddressSanitizer.cpp | 178 +++++++++++------- .../Instrumentation/Instrumentation.cpp | 4 +- .../Instrumentation/AddressSanitizer/basic.ll | 2 + 7 files changed, 166 insertions(+), 68 deletions(-) create mode 100644 include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 1a9c6f82bfd..42bfc55b1aa 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -65,8 +65,8 @@ void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&); -void initializeAddressSanitizerModulePass(PassRegistry&); -void initializeAddressSanitizerPass(PassRegistry&); +void initializeAddressSanitizerModuleLegacyPassPass(PassRegistry &); +void initializeAddressSanitizerLegacyPassPass(PassRegistry &); void initializeAggressiveInstCombinerLegacyPassPass(PassRegistry&); void initializeAliasSetPrinterPass(PassRegistry&); void initializeAlignmentFromAssumptionsPass(PassRegistry&); diff --git
a/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h b/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h new file mode 100644 index 00000000000..021e1bd4c24 --- /dev/null +++ b/include/llvm/Transforms/Instrumentation/AddressSanitizerPass.h @@ -0,0 +1,41 @@ +//===--------- Definition of the AddressSanitizer class ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AddressSanitizer class which is a port of the legacy +// AddressSanitizer pass to use the new PassManager infrastructure. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERPASS_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERPASS_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// Public interface to the address sanitizer pass for instrumenting code to +/// check for various memory bugs. 
+class AddressSanitizerPass : public PassInfoMixin { +public: + explicit AddressSanitizerPass(bool CompileKernel = false, + bool Recover = false, + bool UseAfterScope = false); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + +private: + bool CompileKernel; + bool Recover; + bool UseAfterScope; +}; + +} // namespace llvm + +#endif diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 94afb5409e1..09758dc5651 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -61,7 +61,6 @@ #include "llvm/Support/Regex.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" -#include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/Transforms/IPO/CalledValuePropagation.h" @@ -87,7 +86,9 @@ #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Instrumentation/AddressSanitizerPass.h" #include "llvm/Transforms/Instrumentation/BoundsChecking.h" +#include "llvm/Transforms/Instrumentation/CGProfile.h" #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" #include "llvm/Transforms/Instrumentation/GCOVProfiler.h" #include "llvm/Transforms/Instrumentation/InstrProfiling.h" diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def index 8de4541a772..ad03942fb9a 100644 --- a/lib/Passes/PassRegistry.def +++ b/lib/Passes/PassRegistry.def @@ -40,6 +40,7 @@ MODULE_ALIAS_ANALYSIS("globals-aa", GlobalsAA()) #define MODULE_PASS(NAME, CREATE_PASS) #endif MODULE_PASS("always-inline", AlwaysInlinerPass()) +MODULE_PASS("asan", AddressSanitizerPass(false, false, true)) MODULE_PASS("called-value-propagation", CalledValuePropagationPass()) 
MODULE_PASS("cg-profile", CGProfilePass()) MODULE_PASS("constmerge", ConstantMergePass()) @@ -147,6 +148,7 @@ FUNCTION_PASS("adce", ADCEPass()) FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass()) FUNCTION_PASS("aggressive-instcombine", AggressiveInstCombinePass()) FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass()) +FUNCTION_PASS("asan", AddressSanitizerPass(false, false, false)) FUNCTION_PASS("bdce", BDCEPass()) FUNCTION_PASS("bounds-checking", BoundsCheckingPass()) FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass()) diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 15eba9089cb..b819565e7ba 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -25,7 +25,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/IR/Argument.h" @@ -70,8 +69,10 @@ #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Instrumentation/AddressSanitizerPass.h" #include "llvm/Transforms/Utils/ASanStackFrameLayout.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include @@ -597,26 +598,22 @@ static size_t RedzoneSizeForScale(int MappingScale) { namespace { /// AddressSanitizer: instrument the code in module to find memory bugs. 
-struct AddressSanitizer : public FunctionPass { - // Pass identification, replacement for typeid - static char ID; - - explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false, +struct AddressSanitizer { + explicit AddressSanitizer(Module &M, DominatorTree *DT, + bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false) - : FunctionPass(ID), UseAfterScope(UseAfterScope || ClUseAfterScope) { + : UseAfterScope(UseAfterScope || ClUseAfterScope), DT(DT) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel; - initializeAddressSanitizerPass(*PassRegistry::getPassRegistry()); - } - - StringRef getPassName() const override { - return "AddressSanitizerFunctionPass"; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); + // Initialize the private fields. No one has accessed them before. + GlobalsMD.init(M); + C = &(M.getContext()); + LongSize = M.getDataLayout().getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + TargetTriple = Triple(M.getTargetTriple()); + Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); } uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const { @@ -661,12 +658,12 @@ struct AddressSanitizer : public FunctionPass { Value *SizeArgument, uint32_t Exp); void instrumentMemIntrinsic(MemIntrinsic *MI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); - bool runOnFunction(Function &F) override; bool maybeInsertAsanInitAtFunctionEntry(Function &F); void maybeInsertDynamicShadowAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); - bool doInitialization(Module &M) override; - bool doFinalization(Module &M) override; + + /// Return true if the function changed. 
+ bool instrument(Function &F, const TargetLibraryInfo *TLI); DominatorTree &getDominatorTree() const { return *DT; } @@ -724,16 +721,12 @@ private: DenseMap ProcessedAllocas; }; -class AddressSanitizerModule : public ModulePass { +class AddressSanitizerModule { public: - // Pass identification, replacement for typeid - static char ID; - explicit AddressSanitizerModule(bool CompileKernel = false, bool Recover = false, bool UseGlobalsGC = true) - : ModulePass(ID), - UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + : UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), // Not a typo: ClWithComdat is almost completely pointless without // ClUseGlobalsGC (because then it only works on modules without // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; @@ -742,14 +735,12 @@ public: // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to // do globals-gc. UseCtorComdat(UseGlobalsGC && ClWithComdat) { - this->Recover = ClRecover.getNumOccurrences() > 0 ? - ClRecover : Recover; - this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ? - ClEnableKasan : CompileKernel; - } + this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; + this->CompileKernel = + ClEnableKasan.getNumOccurrences() > 0 ? 
ClEnableKasan : CompileKernel; + } - bool runOnModule(Module &M) override; - StringRef getPassName() const override { return "AddressSanitizerModule"; } + bool instrument(Module &M); private: void initializeCallbacks(Module &M); @@ -1057,18 +1048,100 @@ struct FunctionStackPoisoner : public InstVisitor { Instruction *ThenTerm, Value *ValueIfFalse); }; +class AddressSanitizerLegacyPass : public FunctionPass { +public: + static char ID; + + explicit AddressSanitizerLegacyPass(bool CompileKernel = false, + bool Recover = false, + bool UseAfterScope = false) + : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover), + UseAfterScope(UseAfterScope) {} + + StringRef getPassName() const override { + return "AddressSanitizerFunctionPass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + + bool runOnFunction(Function &F) override { + DominatorTree *DTree = + &getAnalysis().getDomTree(); + const TargetLibraryInfo *TLI = + &getAnalysis().getTLI(); + AddressSanitizer Sanitizer(*F.getParent(), DTree, CompileKernel, Recover, + UseAfterScope); + return Sanitizer.instrument(F, TLI); + } + +private: + bool CompileKernel; + bool Recover; + bool UseAfterScope; +}; + +class AddressSanitizerModuleLegacyPass : public ModulePass { +public: + static char ID; + + explicit AddressSanitizerModuleLegacyPass(bool CompileKernel = false, + bool Recover = false, + bool UseAfterScope = true) + : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover), + UseAfterScope(UseAfterScope) {} + + StringRef getPassName() const override { return "AddressSanitizerModule"; } + + bool runOnModule(Module &M) override { + AddressSanitizerModule Sanitizer(CompileKernel, Recover, UseAfterScope); + return Sanitizer.instrument(M); + } + +private: + bool CompileKernel; + bool Recover; + bool UseAfterScope; +}; + } // end anonymous namespace -char AddressSanitizer::ID = 0; +AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, 
bool Recover, + bool UseAfterScope) + : CompileKernel(CompileKernel), Recover(Recover), + UseAfterScope(UseAfterScope) {} + +PreservedAnalyses AddressSanitizerPass::run(Function &F, + AnalysisManager &AM) { + DominatorTree *DT = &AM.getResult(F); + const TargetLibraryInfo *TLI = &AM.getResult(F); + AddressSanitizer Sanitizer(*F.getParent(), DT, CompileKernel, Recover, + UseAfterScope); + if (Sanitizer.instrument(F, TLI)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +PreservedAnalyses AddressSanitizerPass::run(Module &M, + AnalysisManager &AM) { + AddressSanitizerModule Sanitizer(CompileKernel, Recover, UseAfterScope); + if (Sanitizer.instrument(M)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +char AddressSanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN( - AddressSanitizer, "asan", + AddressSanitizerLegacyPass, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END( - AddressSanitizer, "asan", + AddressSanitizerLegacyPass, "asan", "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false, false) @@ -1076,13 +1149,13 @@ FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel, bool Recover, bool UseAfterScope) { assert(!CompileKernel || Recover); - return new AddressSanitizer(CompileKernel, Recover, UseAfterScope); + return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope); } -char AddressSanitizerModule::ID = 0; +char AddressSanitizerModuleLegacyPass::ID = 0; INITIALIZE_PASS( - AddressSanitizerModule, "asan-module", + AddressSanitizerModuleLegacyPass, "asan-module", "AddressSanitizer: detects use-after-free and out-of-bounds bugs." 
"ModulePass", false, false) @@ -1091,7 +1164,8 @@ ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel, bool Recover, bool UseGlobalsGC) { assert(!CompileKernel || Recover); - return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC); + return new AddressSanitizerModuleLegacyPass(CompileKernel, Recover, + UseGlobalsGC); } static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { @@ -2268,7 +2342,7 @@ int AddressSanitizerModule::GetAsanVersion(const Module &M) const { return Version; } -bool AddressSanitizerModule::runOnModule(Module &M) { +bool AddressSanitizerModule::instrument(Module &M) { C = &(M.getContext()); int LongSize = M.getDataLayout().getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -2387,25 +2461,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) { ArrayType::get(IRB.getInt8Ty(), 0)); } -// virtual -bool AddressSanitizer::doInitialization(Module &M) { - // Initialize the private fields. No one has accessed them before. - GlobalsMD.init(M); - - C = &(M.getContext()); - LongSize = M.getDataLayout().getPointerSizeInBits(); - IntptrTy = Type::getIntNTy(*C, LongSize); - TargetTriple = Triple(M.getTargetTriple()); - - Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel); - return true; -} - -bool AddressSanitizer::doFinalization(Module &M) { - GlobalsMD.reset(); - return false; -} - bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // For each NSObject descendant having a +load method, this method is invoked // by the ObjC runtime before any of the static constructors is called. 
@@ -2479,7 +2534,7 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) { } } -bool AddressSanitizer::runOnFunction(Function &F) { +bool AddressSanitizer::instrument(Function &F, const TargetLibraryInfo *TLI) { if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false; if (F.getName().startswith("__asan_")) return false; @@ -2498,7 +2553,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); - DT = &getAnalysis().getDomTree(); FunctionStateRAII CleanupObj(this); @@ -2519,8 +2573,6 @@ bool AddressSanitizer::runOnFunction(Function &F) { bool IsWrite; unsigned Alignment; uint64_t TypeSize; - const TargetLibraryInfo *TLI = - &getAnalysis().getTLI(); // Fill the set of memory operations to instrument. for (auto &BB : F) { diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index d52b1b92817..ea819c1856b 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -56,8 +56,8 @@ BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB, /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. 
void llvm::initializeInstrumentation(PassRegistry &Registry) { - initializeAddressSanitizerPass(Registry); - initializeAddressSanitizerModulePass(Registry); + initializeAddressSanitizerLegacyPassPass(Registry); + initializeAddressSanitizerModuleLegacyPassPass(Registry); initializeBoundsCheckingLegacyPassPass(Registry); initializeControlHeightReductionLegacyPassPass(Registry); initializeGCOVProfilerLegacyPassPass(Registry); diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll index 099965348eb..be80a89392c 100644 --- a/test/Instrumentation/AddressSanitizer/basic.ll +++ b/test/Instrumentation/AddressSanitizer/basic.ll @@ -1,7 +1,9 @@ ; Test basic address sanitizer instrumentation. ; ; RUN: opt < %s -asan -asan-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s +; RUN: opt < %s -passes='function(asan),module(asan)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s ; RUN: opt < %s -asan -asan-module -asan-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s +; RUN: opt < %s -passes='function(asan),module(asan)' -asan-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" -- GitLab From 4db84ee724af1ef6d709901e826fae6ee67c696a Mon Sep 17 00:00:00 2001 From: Artem Dergachev Date: Thu, 11 Oct 2018 18:43:08 +0000 Subject: [PATCH 0065/1116] Revert r344197 "[MC][ELF] compute entity size for explicit sections" Revert r344206 "[MC][ELF] Fix section_mergeable_size.ll" They were causing failures on too many important buildbots for too long. Please revert eagerly if your fix takes more than a couple of hours to land! 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344278 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 50 +++++++++---------- .../CodeGen/Generic/section_mergeable_size.ll | 3 -- 2 files changed, 25 insertions(+), 28 deletions(-) delete mode 100644 test/CodeGen/Generic/section_mergeable_size.ll diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index b046cd81d6c..f6882c40531 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -506,30 +506,6 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, return OtherGO ? dyn_cast(TM.getSymbol(OtherGO)) : nullptr; } -static unsigned getEntrySizeForKind(SectionKind Kind) { - if (Kind.isMergeable1ByteCString()) - return 1; - else if (Kind.isMergeable2ByteCString()) - return 2; - else if (Kind.isMergeable4ByteCString()) - return 4; - else if (Kind.isMergeableConst4()) - return 4; - else if (Kind.isMergeableConst8()) - return 8; - else if (Kind.isMergeableConst16()) - return 16; - else if (Kind.isMergeableConst32()) - return 32; - else { - // We shouldn't have mergeable C strings or mergeable constants that we - // didn't handle above. - assert(!Kind.isMergeableCString() && "unknown string width"); - assert(!Kind.isMergeableConst() && "unknown data width"); - return 0; - } -} - MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { StringRef SectionName = GO->getSection(); @@ -574,7 +550,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( MCSectionELF *Section = getContext().getELFSection( SectionName, getELFSectionType(SectionName, Kind), Flags, - getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol); + /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); // Make sure that we did not get some other section with incompatible sh_link. 
// This should not be possible due to UniqueID code above. assert(Section->getAssociatedSymbol() == AssociatedSymbol && @@ -601,6 +577,30 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro"; } +static unsigned getEntrySizeForKind(SectionKind Kind) { + if (Kind.isMergeable1ByteCString()) + return 1; + else if (Kind.isMergeable2ByteCString()) + return 2; + else if (Kind.isMergeable4ByteCString()) + return 4; + else if (Kind.isMergeableConst4()) + return 4; + else if (Kind.isMergeableConst8()) + return 8; + else if (Kind.isMergeableConst16()) + return 16; + else if (Kind.isMergeableConst32()) + return 32; + else { + // We shouldn't have mergeable C strings or mergeable constants that we + // didn't handle above. + assert(!Kind.isMergeableCString() && "unknown string width"); + assert(!Kind.isMergeableConst() && "unknown data width"); + return 0; + } +} + static MCSectionELF *selectELFSectionForGlobal( MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, diff --git a/test/CodeGen/Generic/section_mergeable_size.ll b/test/CodeGen/Generic/section_mergeable_size.ll deleted file mode 100644 index 0a7ddd110c4..00000000000 --- a/test/CodeGen/Generic/section_mergeable_size.ll +++ /dev/null @@ -1,3 +0,0 @@ -; RUN: llc < %s | FileCheck %s -@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4 -; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4 -- GitLab From 75105c59a05de66c1c3237d21be676f17932c0ca Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Thu, 11 Oct 2018 18:45:44 +0000 Subject: [PATCH 0066/1116] Revert SymbolFileNativePDB plugin. This was originally causing some test failures on non-Windows platforms, which required fixes in the compiler and linker. After those fixes, however, other tests started failing. Reverting temporarily until I can address everything. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344279 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 8232f076a93..4d45a103c5a 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -73,7 +73,6 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/Path.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -135,9 +134,7 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) { // If this is a Unix-style path, just use it as is. Don't try to canonicalize // it textually because one of the path components could be a symlink. - if (Dir.startswith("/") || Filename.startswith("/")) { - if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix)) - return Filename; + if (!Dir.empty() && Dir[0] == '/') { Filepath = Dir; if (Dir.back() != '/') Filepath += '/'; -- GitLab From 07125b4a5bda6c5c6c7debfadf48ac23350f1d7b Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 11 Oct 2018 18:45:48 +0000 Subject: [PATCH 0067/1116] [WebAssembly] Revert rL344180, which was breaking expensive checks git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344280 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrAtomics.td | 16 ++++++++++++++ .../WebAssembly/WebAssemblyInstrCall.td | 4 ++++ .../WebAssembly/WebAssemblyInstrControl.td | 8 +++++++ .../WebAssembly/WebAssemblyInstrConv.td | 12 ++++++++++ .../WebAssembly/WebAssemblyInstrExceptRef.td | 4 ++++ .../WebAssembly/WebAssemblyInstrFloat.td | 12 ++++++++++ .../WebAssembly/WebAssemblyInstrFormats.td | 1 - .../WebAssembly/WebAssemblyInstrInfo.td | 7 ++++-- 
.../WebAssembly/WebAssemblyInstrInteger.td | 9 ++++++++ .../WebAssembly/WebAssemblyInstrMemory.td | 22 +++++++++++++++++++ .../WebAssembly/WebAssemblyInstrSIMD.td | 2 ++ 11 files changed, 94 insertions(+), 3 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td index f9d092e4b8a..9eff2cfde0a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -24,8 +24,10 @@ multiclass ATOMIC_I; } +let Defs = [ARGUMENTS] in { defm ATOMIC_LOAD_I32 : WebAssemblyLoad; defm ATOMIC_LOAD_I64 : WebAssemblyLoad; +} // Defs = [ARGUMENTS] // Select loads with no constant offset. let Predicates = [HasAtomics] in { @@ -60,11 +62,13 @@ def : LoadPatExternSymOffOnly; // Extending loads. Note that there are only zero-extending atomic loads, no // sign-extending loads. +let Defs = [ARGUMENTS] in { defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad; defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad; defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad; defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad; defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad; +} // Defs = [ARGUMENTS] // Fragments for extending loads. These are different from regular loads because // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and @@ -196,8 +200,10 @@ def : LoadPatExternSymOffOnly; // Atomic stores //===----------------------------------------------------------------------===// +let Defs = [ARGUMENTS] in { defm ATOMIC_STORE_I32 : WebAssemblyStore; defm ATOMIC_STORE_I64 : WebAssemblyStore; +} // Defs = [ARGUMENTS] // We need an 'atomic' version of store patterns because store and atomic_store // nodes have different operand orders: @@ -257,11 +263,13 @@ def : AStorePatExternSymOffOnly; } // Predicates = [HasAtomics] // Truncating stores. 
+let Defs = [ARGUMENTS] in { defm ATOMIC_STORE8_I32 : WebAssemblyStore; defm ATOMIC_STORE16_I32 : WebAssemblyStore; defm ATOMIC_STORE8_I64 : WebAssemblyStore; defm ATOMIC_STORE16_I64 : WebAssemblyStore; defm ATOMIC_STORE32_I64 : WebAssemblyStore; +} // Defs = [ARGUMENTS] // Fragments for truncating stores. @@ -333,6 +341,8 @@ def : AStorePatExternSymOffOnly; // Atomic binary read-modify-writes //===----------------------------------------------------------------------===// +let Defs = [ARGUMENTS] in { + multiclass WebAssemblyBinRMW { defm "" : I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val), @@ -420,6 +430,7 @@ defm ATOMIC_RMW16_U_XCHG_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW32_U_XCHG_I64 : WebAssemblyBinRMW; +} // Select binary RMWs with no constant offset. class BinRMWPatNoOffset : @@ -663,6 +674,8 @@ defm : BinRMWTruncExtPattern< // Consider adding a pass after instruction selection that optimizes this case // if it is frequent. +let Defs = [ARGUMENTS] in { + multiclass WebAssemblyTerRMW { defm "" : I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp, @@ -686,6 +699,7 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 : WebAssemblyTerRMW; defm ATOMIC_RMW32_U_CMPXCHG_I64 : WebAssemblyTerRMW; +} // Select ternary RMWs with no constant offset. class TerRMWPatNoOffset : @@ -898,6 +912,7 @@ defm : TerRMWTruncExtPattern< // Atomic wait / notify //===----------------------------------------------------------------------===// +let Defs = [ARGUMENTS] in { let hasSideEffects = 1 in { defm ATOMIC_NOTIFY : I<(outs I32:$dst), @@ -920,6 +935,7 @@ defm ATOMIC_WAIT_I64 : "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>; } // mayLoad = 1 } // hasSideEffects = 1 +} // Defs = [ARGUMENTS] let Predicates = [HasAtomics] in { // Select notifys with no constant offset. 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 07839b79011..3c9caa3f0de 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -15,6 +15,8 @@ // TODO: addr64: These currently assume the callee address is 32-bit. // FIXME: add $type to first call_indirect asmstr (and maybe $flags) +let Defs = [ARGUMENTS] in { + // Call sequence markers. These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { @@ -116,6 +118,8 @@ let Uses = [SP32, SP64], isCall = 1 in { 0x11>; } // Uses = [SP32,SP64], isCall = 1 +} // Defs = [ARGUMENTS] + // Patterns for matching a direct call to a global address. def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), (CALL_I32 tglobaladdr:$callee)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index ed9879ae454..e27d81937dd 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -12,6 +12,8 @@ /// //===----------------------------------------------------------------------===// +let Defs = [ARGUMENTS] in { + let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { // The condition operand is a boolean value which WebAssembly represents as i32. 
defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond), @@ -28,11 +30,15 @@ defm BR : NRI<(outs), (ins bb_op:$dst), } // isBarrier = 1 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 +} // Defs = [ARGUMENTS] + def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), (BR_IF bb_op:$dst, I32:$cond)>; def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), (BR_UNLESS bb_op:$dst, I32:$cond)>; +let Defs = [ARGUMENTS] in { + // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode // currently. @@ -188,3 +194,5 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, [(catchret bb:$dst, bb:$from)], "", 0>; } } + +} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 0d772c743a7..6dca96f3ddd 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -13,6 +13,8 @@ /// //===----------------------------------------------------------------------===// +let Defs = [ARGUMENTS] in { + defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (trunc I64:$src))], "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>; @@ -49,11 +51,15 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins), 0xc4>; } // Predicates = [HasSignExt] +} // Defs = [ARGUMENTS] + // Expand a "don't care" extend into zero-extend (chosen over sign-extend // somewhat arbitrarily, although it favors popular hardware architectures // and is conceptually a simpler operation). def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; +let Defs = [ARGUMENTS] in { + // Conversion from floating point to integer instructions which don't trap on // overflow or invalid. 
defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins), @@ -97,6 +103,8 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins), "i64.trunc_u:sat/f64", 0xfc07>, Requires<[HasNontrappingFPToInt]>; +} // Defs = [ARGUMENTS] + // Lower llvm.wasm.trunc.saturate.* to saturating instructions def : Pat<(int_wasm_trunc_saturate_signed F32:$src), (I32_TRUNC_S_SAT_F32 F32:$src)>; @@ -115,6 +123,8 @@ def : Pat<(int_wasm_trunc_saturate_signed F64:$src), def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src), (I64_TRUNC_U_SAT_F64 F64:$src)>; +let Defs = [ARGUMENTS] in { + // Conversion from floating point to integer pseudo-instructions which don't // trap on overflow or invalid. let usesCustomInserter = 1, isCodeGenOnly = 1 in { @@ -230,3 +240,5 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins), [(set F64:$dst, (bitconvert I64:$src))], "f64.reinterpret/i64\t$dst, $src", "f64.reinterpret/i64", 0xbf>; + +} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td index a251d60b89e..41b39f69e51 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td @@ -12,6 +12,8 @@ /// //===----------------------------------------------------------------------===// +let Defs = [ARGUMENTS] in { + defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond), (outs), (ins), @@ -21,6 +23,8 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), "except_ref.select\t$dst, $lhs, $rhs, $cond", "except_ref.select", 0x1b>; +} // Defs = [ARGUMENTS] + def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td 
b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index 364c485f409..70e27df27e6 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -45,6 +45,8 @@ multiclass ComparisonFP f32Inst, bits<32> f !strconcat("f64.", name), f64Inst>; } +let Defs = [ARGUMENTS] in { + let isCommutable = 1 in defm ADD : BinaryFP; defm SUB : BinaryFP; @@ -67,6 +69,8 @@ defm FLOOR : UnaryFP; defm TRUNC : UnaryFP; defm NEAREST : UnaryFP; +} // Defs = [ARGUMENTS] + // DAGCombine oddly folds casts into the rhs of copysign. Unfold them. def : Pat<(fcopysign F64:$lhs, F32:$rhs), (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; @@ -77,6 +81,8 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs), def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; +let Defs = [ARGUMENTS] in { + let isCommutable = 1 in { defm EQ : ComparisonFP; defm NE : ComparisonFP; @@ -86,6 +92,8 @@ defm LE : ComparisonFP; defm GT : ComparisonFP; defm GE : ComparisonFP; +} // Defs = [ARGUMENTS] + // Don't care floating-point comparisons, supported via other comparisons. 
def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>; def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>; @@ -100,6 +108,8 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>; def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>; def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>; +let Defs = [ARGUMENTS] in { + defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond), (outs), (ins), [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))], @@ -109,6 +119,8 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond), [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))], "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>; +} // Defs = [ARGUMENTS] + // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 2d23acfc825..683fb3d981f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -30,7 +30,6 @@ class NI pattern, bit stack, string asmstr = "", dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; - let Defs = [ARGUMENTS]; } // Generates both register and stack based versions of one actual instruction. 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 9e1409cf90e..a2ea14cc28b 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -164,8 +164,7 @@ include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// multiclass ARGUMENT { - let hasSideEffects = 1, isCodeGenOnly = 1, - Defs = [], Uses = [ARGUMENTS] in + let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno), [(set vt:$res, (WebAssemblyargument timm:$argno))]>; @@ -176,6 +175,8 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; +let Defs = [ARGUMENTS] in { + // get_local and set_local are not generated by instruction selection; they // are implied by virtual register uses and defs. multiclass LOCAL { @@ -265,6 +266,8 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>; } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 +} // Defs = [ARGUMENTS] + def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), (CONST_I32 tglobaladdr:$addr)>; def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index bd41f46214a..44c93de54aa 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -45,6 +45,9 @@ multiclass ComparisonInt i32Inst, bits<32> !strconcat("i64.", name), i64Inst>; } + +let Defs = [ARGUMENTS] in { + // The spaces after the names are for aesthetic purposes only, to make // operands line up vertically after tab expansion. 
let isCommutable = 1 in @@ -94,12 +97,16 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (setcc I64:$src, 0, SETEQ))], "i64.eqz \t$dst, $src", "i64.eqz", 0x50>; +} // Defs = [ARGUMENTS] + // Optimize away an explicit mask on a rotate count. def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>; def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>; +let Defs = [ARGUMENTS] in { + defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond), (outs), (ins), [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], @@ -109,6 +116,8 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond), [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>; +} // Defs = [ARGUMENTS] + // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index ccc331d1bf0..76ef1461d22 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -53,6 +53,8 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off), // We don't need a regPlusES because external symbols never have constant // offsets folded into them, so we can just use add. +let Defs = [ARGUMENTS] in { + // Defines atomic and non-atomic loads, regular and extending. multiclass WebAssemblyLoad { let mayLoad = 1 in @@ -71,6 +73,8 @@ defm LOAD_I64 : WebAssemblyLoad; defm LOAD_F32 : WebAssemblyLoad; defm LOAD_F64 : WebAssemblyLoad; +} // Defs = [ARGUMENTS] + // Select loads with no constant offset. 
class LoadPatNoOffset : Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>; @@ -140,6 +144,8 @@ def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; +let Defs = [ARGUMENTS] in { + // Extending load. defm LOAD8_S_I32 : WebAssemblyLoad; defm LOAD8_U_I32 : WebAssemblyLoad; @@ -152,6 +158,8 @@ defm LOAD16_U_I64 : WebAssemblyLoad; defm LOAD32_S_I64 : WebAssemblyLoad; defm LOAD32_U_I64 : WebAssemblyLoad; +} // Defs = [ARGUMENTS] + // Select extending loads with no constant offset. def : LoadPatNoOffset; def : LoadPatNoOffset; @@ -295,6 +303,9 @@ def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; + +let Defs = [ARGUMENTS] in { + // Defines atomic and non-atomic stores, regular and truncating multiclass WebAssemblyStore { let mayStore = 1 in @@ -312,6 +323,8 @@ defm STORE_I64 : WebAssemblyStore; defm STORE_F32 : WebAssemblyStore; defm STORE_F64 : WebAssemblyStore; +} // Defs = [ARGUMENTS] + // Select stores with no constant offset. class StorePatNoOffset : Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>; @@ -376,6 +389,9 @@ def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; + +let Defs = [ARGUMENTS] in { + // Truncating store. defm STORE8_I32 : WebAssemblyStore; defm STORE16_I32 : WebAssemblyStore; @@ -383,6 +399,8 @@ defm STORE8_I64 : WebAssemblyStore; defm STORE16_I64 : WebAssemblyStore; defm STORE32_I64 : WebAssemblyStore; +} // Defs = [ARGUMENTS] + // Select truncating stores with no constant offset. def : StorePatNoOffset; def : StorePatNoOffset; @@ -430,6 +448,8 @@ def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; +let Defs = [ARGUMENTS] in { + // Current memory size. 
defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags), (outs), (ins i32imm:$flags), @@ -473,6 +493,8 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta), 0x40>, Requires<[HasAddr32]>; +} // Defs = [ARGUMENTS] + def : Pat<(int_wasm_current_memory), (CURRENT_MEMORY_I32 0)>; def : Pat<(int_wasm_grow_memory I32:$delta), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 419aa0b437f..57024616f3f 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -55,6 +55,7 @@ multiclass ConstVec { "v128.const\t"#args, 0>; } +let Defs = [ARGUMENTS] in { defm "" : ConstVec; +} // Defs = [ARGUMENTS] // Create vector with identical lanes: splat def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; -- GitLab From 19a8ca2849de4671efe24908c3719eb1e294ee06 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 11 Oct 2018 19:42:46 +0000 Subject: [PATCH 0068/1116] [Pipeliner] Fix the Schedule DAG topological order. This patch updates the topological ordering of the nodes to reflect the DAG change. Differential Revision: https://reviews.llvm.org/D53105 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344282 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachinePipeliner.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 3d8510f7c0c..1109be15077 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -1295,6 +1295,7 @@ void SwingSchedulerDAG::changeDependences() { // Add a dependence between the new instruction and the instruction // that defines the new base. 
SDep Dep(&I, SDep::Anti, NewBase); + Topo.AddPred(LastSU, &I); LastSU->addPred(Dep); // Remember the base and offset information so that we can update the -- GitLab From ba50914be1ad0c99e0dae4bcab52b3b167137cde Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 11 Oct 2018 19:45:07 +0000 Subject: [PATCH 0069/1116] [Pipeliner] Use the Index from Topo instead of relying on NodeNum. (NFC) In the future, if we add any new DAG mutations other than artificial dependencies, the NodeNum may no longer be valid. Instead, the index from the topological sort of the schedule DAG can be used, as long as we update it with the DAG change. Differential Revision: https://reviews.llvm.org/D53104 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344283 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/MachinePipeliner.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index 1109be15077..02344225391 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -278,12 +278,21 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs { BitVector Blocked; SmallVector, 10> B; SmallVector, 16> AdjK; + // Node to Index from ScheduleDAGTopologicalSort + std::vector *Node2Idx; unsigned NumPaths; static unsigned MaxPaths; public: - Circuits(std::vector &SUs) - : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {} + Circuits(std::vector &SUs, ScheduleDAGTopologicalSort &Topo) + : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) { + Node2Idx = new std::vector(SUs.size()); + unsigned Idx = 0; + for (const auto &NodeNum : Topo) + Node2Idx->at(NodeNum) = Idx++; + } + + ~Circuits() { delete Node2Idx; } /// Reset the data structures used in the circuit algorithm. 
void reset() { @@ -1562,7 +1571,8 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets, ++NumPaths; break; } else if (!Blocked.test(W)) { - if (circuit(W, S, NodeSets, W < V ? true : HasBackedge)) + if (circuit(W, S, NodeSets, + Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge)) F = true; } } @@ -1602,7 +1612,7 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) { // but we do this to find the circuits, and then change them back. swapAntiDependences(SUnits); - Circuits Cir(SUnits); + Circuits Cir(SUnits, Topo); // Create the adjacency structure. Cir.createAdjacencyStructure(this); for (int i = 0, e = SUnits.size(); i != e; ++i) { -- GitLab From 2141d146188bfbfce6917ff11f606de819b57d52 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 11 Oct 2018 19:48:15 +0000 Subject: [PATCH 0070/1116] [Hexagon] Restrict compound instructions with constant value. Having a constant value operand in the compound instruction is not always profitable. This patch improves coremark by ~4% on Hexagon. Differential Revision: https://reviews.llvm.org/D53152 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344284 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Hexagon/HexagonPatterns.td | 37 +++++++++++----- test/CodeGen/Hexagon/constant_compound.ll | 52 +++++++++++++++++++++++ 2 files changed, 79 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/Hexagon/constant_compound.ll diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 2f5033a20af..f671238ec12 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -257,6 +257,23 @@ class pf2 : PatFrag<(ops node:$a, node:$b), (Op node:$a, node:$b)>; class Not2 : PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>; +// If there is a constant operand that feeds the and/or instruction, +// do not generate the compound instructions. 
+// It is not always profitable, as some times we end up with a transfer. +// Check the below example. +// ra = #65820; rb = lsr(rb, #8); rc ^= and (rb, ra) +// Instead this is preferable. +// ra = and (#65820, lsr(ra, #8)); rb = xor(rb, ra) +class Su_ni1 + : PatFraggetOperand(1); + return !dyn_cast(Op1); + } + return false;}], + Op.OperandTransform>; + class Su : PatFrag; @@ -1336,16 +1353,16 @@ def: Pat<(mul I32:$Rs, n8_0ImmPred:$n8), def: Pat<(add Sext64:$Rs, I64:$Rt), (A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I32, I32, I32>; -def: AccRRR_pat, I64, I64, I64>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I32, I32, I32>; +def: AccRRR_pat, I64, I64, I64>; // For dags like (or (and (not _), _), (shl _, _)) where the "or" with // one argument matches the patterns below, and with the other argument diff --git a/test/CodeGen/Hexagon/constant_compound.ll b/test/CodeGen/Hexagon/constant_compound.ll new file mode 100644 index 00000000000..4ca2dc5d4ed --- /dev/null +++ b/test/CodeGen/Hexagon/constant_compound.ll @@ -0,0 +1,52 @@ +; RUN: llc -march=hexagon < %s 2>&1 | FileCheck %s + +; Generating a compound instruction with a constant is not profitable. +; The constant needs to be kept in a register before it is fed to compound +; instruction. 
+; Before, we are generating +; ra = #65820; +; rb = lsr(rb, #8); +; rc ^= and (rb, ra) +; Now, we are generating +; ra = and (#65820, lsr(ra, #8)); +; rb = xor(rb, ra) + +; CHECK: and(##65280,lsr(r +; CHECK-NOT : ^= and + +define dso_local zeroext i16 @test_compound(i16 zeroext %varA, i16 zeroext %varB) local_unnamed_addr #0 { +entry: + %tmp = zext i16 %varB to i32 + %tmp1 = and i16 %varA, 255 + %tmp2 = zext i16 %tmp1 to i32 + %.masked.i = and i32 %tmp, 255 + %tmp3 = xor i32 %.masked.i, %tmp2 + %tmp4 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp3, i32 255) #2 + %tmp5 = trunc i64 %tmp4 to i32 + %tmp6 = and i32 %tmp5, 255 + %tmp7 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp6, i32 81922) #2 + %tmp8 = trunc i64 %tmp7 to i32 + %tmp9 = xor i32 %tmp8, %tmp + %tmp10 = lshr i32 %tmp9, 8 + %tmp11 = lshr i16 %varA, 8 + %conv2 = zext i16 %tmp11 to i32 + %tmp12 = and i32 %tmp10, 65280 + %.masked.i7 = and i32 %tmp10, 255 + %tmp13 = xor i32 %.masked.i7, %conv2 + %tmp14 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp13, i32 255) #2 + %tmp15 = trunc i64 %tmp14 to i32 + %tmp16 = and i32 %tmp15, 255 + %tmp17 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp16, i32 81922) #2 + %tmp18 = trunc i64 %tmp17 to i32 + %tmp19 = xor i32 %tmp12, %tmp18 + %tmp20 = lshr i32 %tmp19, 8 + %tmp21 = trunc i32 %tmp20 to i16 + ret i16 %tmp21 +} + +; Function Attrs: nounwind readnone +declare i64 @llvm.hexagon.M4.pmpyw(i32, i32) #1 + +attributes #0 = { nounwind readnone "target-cpu"="hexagonv65" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind } -- GitLab From 43adb6744a1ec2d32dd6d86d570f87a54d51b64d Mon Sep 17 00:00:00 2001 From: Warren Ristow Date: Thu, 11 Oct 2018 20:19:25 +0000 Subject: [PATCH 0071/1116] Update test of r344198 to work with release builds. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344286 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/LTO/X86/libcall-overridden-via-alias.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/LTO/X86/libcall-overridden-via-alias.ll b/test/LTO/X86/libcall-overridden-via-alias.ll index cac125b2843..04e1512f5b8 100755 --- a/test/LTO/X86/libcall-overridden-via-alias.ll +++ b/test/LTO/X86/libcall-overridden-via-alias.ll @@ -13,8 +13,8 @@ ; ; Check that the IR contains the overriding definition of the library routine ; in the IR after LTO: -; CHECK_IR: define internal float @logf(float [[X:%.*]]) -; CHECK_IR-NEXT: [[TMP:%.*]] = fadd float [[X]], [[X]] +; CHECK_IR: define internal float @logf(float +; CHECK_IR-NEXT: [[TMP:%.*]] = fadd float ; CHECK_IR-NEXT: ret float [[TMP]] ; ; Check that the assembly code from LTO contains the call to the expected -- GitLab From b536aafd96d3e218c12d5525d0d145c051e42ae2 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 11 Oct 2018 20:21:22 +0000 Subject: [PATCH 0072/1116] [WebAssembly][NFC] Remove repetition of Defs = [ARGUMENTS] (fixed) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344287 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrAtomics.td | 16 -------------- .../WebAssembly/WebAssemblyInstrCall.td | 4 ---- .../WebAssembly/WebAssemblyInstrControl.td | 8 ------- .../WebAssembly/WebAssemblyInstrConv.td | 12 ---------- .../WebAssembly/WebAssemblyInstrExceptRef.td | 4 ---- .../WebAssembly/WebAssemblyInstrFloat.td | 12 ---------- .../WebAssembly/WebAssemblyInstrFormats.td | 1 + .../WebAssembly/WebAssemblyInstrInfo.td | 7 ++---- .../WebAssembly/WebAssemblyInstrInteger.td | 9 -------- .../WebAssembly/WebAssemblyInstrMemory.td | 22 ------------------- .../WebAssembly/WebAssemblyInstrSIMD.td | 5 ++--- 11 files changed, 5 insertions(+), 95 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td 
index 9eff2cfde0a..f9d092e4b8a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td @@ -24,10 +24,8 @@ multiclass ATOMIC_I; } -let Defs = [ARGUMENTS] in { defm ATOMIC_LOAD_I32 : WebAssemblyLoad; defm ATOMIC_LOAD_I64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] // Select loads with no constant offset. let Predicates = [HasAtomics] in { @@ -62,13 +60,11 @@ def : LoadPatExternSymOffOnly; // Extending loads. Note that there are only zero-extending atomic loads, no // sign-extending loads. -let Defs = [ARGUMENTS] in { defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad; defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad; defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad; defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad; defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] // Fragments for extending loads. These are different from regular loads because // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and @@ -200,10 +196,8 @@ def : LoadPatExternSymOffOnly; // Atomic stores //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { defm ATOMIC_STORE_I32 : WebAssemblyStore; defm ATOMIC_STORE_I64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] // We need an 'atomic' version of store patterns because store and atomic_store // nodes have different operand orders: @@ -263,13 +257,11 @@ def : AStorePatExternSymOffOnly; } // Predicates = [HasAtomics] // Truncating stores. -let Defs = [ARGUMENTS] in { defm ATOMIC_STORE8_I32 : WebAssemblyStore; defm ATOMIC_STORE16_I32 : WebAssemblyStore; defm ATOMIC_STORE8_I64 : WebAssemblyStore; defm ATOMIC_STORE16_I64 : WebAssemblyStore; defm ATOMIC_STORE32_I64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] // Fragments for truncating stores. 
@@ -341,8 +333,6 @@ def : AStorePatExternSymOffOnly; // Atomic binary read-modify-writes //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - multiclass WebAssemblyBinRMW { defm "" : I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val), @@ -430,7 +420,6 @@ defm ATOMIC_RMW16_U_XCHG_I64 : WebAssemblyBinRMW; defm ATOMIC_RMW32_U_XCHG_I64 : WebAssemblyBinRMW; -} // Select binary RMWs with no constant offset. class BinRMWPatNoOffset : @@ -674,8 +663,6 @@ defm : BinRMWTruncExtPattern< // Consider adding a pass after instruction selection that optimizes this case // if it is frequent. -let Defs = [ARGUMENTS] in { - multiclass WebAssemblyTerRMW { defm "" : I<(outs rc:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp, @@ -699,7 +686,6 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 : WebAssemblyTerRMW; defm ATOMIC_RMW32_U_CMPXCHG_I64 : WebAssemblyTerRMW; -} // Select ternary RMWs with no constant offset. class TerRMWPatNoOffset : @@ -912,7 +898,6 @@ defm : TerRMWTruncExtPattern< // Atomic wait / notify //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { let hasSideEffects = 1 in { defm ATOMIC_NOTIFY : I<(outs I32:$dst), @@ -935,7 +920,6 @@ defm ATOMIC_WAIT_I64 : "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>; } // mayLoad = 1 } // hasSideEffects = 1 -} // Defs = [ARGUMENTS] let Predicates = [HasAtomics] in { // Select notifys with no constant offset. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 3c9caa3f0de..07839b79011 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -15,8 +15,6 @@ // TODO: addr64: These currently assume the callee address is 32-bit. // FIXME: add $type to first call_indirect asmstr (and maybe $flags) -let Defs = [ARGUMENTS] in { - // Call sequence markers. 
These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { @@ -118,8 +116,6 @@ let Uses = [SP32, SP64], isCall = 1 in { 0x11>; } // Uses = [SP32,SP64], isCall = 1 -} // Defs = [ARGUMENTS] - // Patterns for matching a direct call to a global address. def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), (CALL_I32 tglobaladdr:$callee)>; diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td index e27d81937dd..ed9879ae454 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -12,8 +12,6 @@ /// //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { // The condition operand is a boolean value which WebAssembly represents as i32. defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond), @@ -30,15 +28,11 @@ defm BR : NRI<(outs), (ins bb_op:$dst), } // isBarrier = 1 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 -} // Defs = [ARGUMENTS] - def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), (BR_IF bb_op:$dst, I32:$cond)>; def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), (BR_UNLESS bb_op:$dst, I32:$cond)>; -let Defs = [ARGUMENTS] in { - // TODO: SelectionDAG's lowering insists on using a pointer as the index for // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode // currently. 
@@ -194,5 +188,3 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, [(catchret bb:$dst, bb:$from)], "", 0>; } } - -} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 6dca96f3ddd..0d772c743a7 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -13,8 +13,6 @@ /// //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (trunc I64:$src))], "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>; @@ -51,15 +49,11 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins), 0xc4>; } // Predicates = [HasSignExt] -} // defs = [ARGUMENTS] - // Expand a "don't care" extend into zero-extend (chosen over sign-extend // somewhat arbitrarily, although it favors popular hardware architectures // and is conceptually a simpler operation). def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; -let Defs = [ARGUMENTS] in { - // Conversion from floating point to integer instructions which don't trap on // overflow or invalid. defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins), @@ -103,8 +97,6 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins), "i64.trunc_u:sat/f64", 0xfc07>, Requires<[HasNontrappingFPToInt]>; -} // Defs = [Arguments] - // Lower llvm.wasm.trunc.saturate.* to saturating instructions def : Pat<(int_wasm_trunc_saturate_signed F32:$src), (I32_TRUNC_S_SAT_F32 F32:$src)>; @@ -123,8 +115,6 @@ def : Pat<(int_wasm_trunc_saturate_signed F64:$src), def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src), (I64_TRUNC_U_SAT_F64 F64:$src)>; -let Defs = [ARGUMENTS] in { - // Conversion from floating point to integer pseudo-instructions which don't // trap on overflow or invalid. 
let usesCustomInserter = 1, isCodeGenOnly = 1 in { @@ -240,5 +230,3 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins), [(set F64:$dst, (bitconvert I64:$src))], "f64.reinterpret/i64\t$dst, $src", "f64.reinterpret/i64", 0xbf>; - -} // Defs = [ARGUMENTS] diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td index 41b39f69e51..a251d60b89e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td @@ -12,8 +12,6 @@ /// //===----------------------------------------------------------------------===// -let Defs = [ARGUMENTS] in { - defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond), (outs), (ins), @@ -23,8 +21,6 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst), "except_ref.select\t$dst, $lhs, $rhs, $cond", "except_ref.select", 0x1b>; -} // Defs = [ARGUMENTS] - def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index 70e27df27e6..364c485f409 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -45,8 +45,6 @@ multiclass ComparisonFP f32Inst, bits<32> f !strconcat("f64.", name), f64Inst>; } -let Defs = [ARGUMENTS] in { - let isCommutable = 1 in defm ADD : BinaryFP; defm SUB : BinaryFP; @@ -69,8 +67,6 @@ defm FLOOR : UnaryFP; defm TRUNC : UnaryFP; defm NEAREST : UnaryFP; -} // Defs = [ARGUMENTS] - // DAGCombine oddly folds casts into the rhs of copysign. Unfold them. 
def : Pat<(fcopysign F64:$lhs, F32:$rhs), (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; @@ -81,8 +77,6 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs), def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; -let Defs = [ARGUMENTS] in { - let isCommutable = 1 in { defm EQ : ComparisonFP; defm NE : ComparisonFP; @@ -92,8 +86,6 @@ defm LE : ComparisonFP; defm GT : ComparisonFP; defm GE : ComparisonFP; -} // Defs = [ARGUMENTS] - // Don't care floating-point comparisons, supported via other comparisons. def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>; def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>; @@ -108,8 +100,6 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>; def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>; def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>; -let Defs = [ARGUMENTS] in { - defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond), (outs), (ins), [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))], @@ -119,8 +109,6 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond), [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))], "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>; -} // Defs = [ARGUMENTS] - // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 683fb3d981f..2d23acfc825 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -30,6 +30,7 @@ class NI pattern, bit stack, string asmstr = "", dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; + let Defs = [ARGUMENTS]; } // Generates both register and stack based versions of one actual instruction. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index a2ea14cc28b..9e1409cf90e 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -164,7 +164,8 @@ include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// multiclass ARGUMENT { - let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in + let hasSideEffects = 1, isCodeGenOnly = 1, + Defs = [], Uses = [ARGUMENTS] in defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno), [(set vt:$res, (WebAssemblyargument timm:$argno))]>; @@ -175,8 +176,6 @@ defm "": ARGUMENT; defm "": ARGUMENT; defm "": ARGUMENT; -let Defs = [ARGUMENTS] in { - // get_local and set_local are not generated by instruction selection; they // are implied by virtual register uses and defs. 
multiclass LOCAL { @@ -266,8 +265,6 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm), "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>; } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 -} // Defs = [ARGUMENTS] - def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), (CONST_I32 tglobaladdr:$addr)>; def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 44c93de54aa..bd41f46214a 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -45,9 +45,6 @@ multiclass ComparisonInt i32Inst, bits<32> !strconcat("i64.", name), i64Inst>; } - -let Defs = [ARGUMENTS] in { - // The spaces after the names are for aesthetic purposes only, to make // operands line up vertically after tab expansion. let isCommutable = 1 in @@ -97,16 +94,12 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (setcc I64:$src, 0, SETEQ))], "i64.eqz \t$dst, $src", "i64.eqz", 0x50>; -} // Defs = [ARGUMENTS] - // Optimize away an explicit mask on a rotate count. 
def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>; def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>; -let Defs = [ARGUMENTS] in { - defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond), (outs), (ins), [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], @@ -116,8 +109,6 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond), [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>; -} // Defs = [ARGUMENTS] - // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 76ef1461d22..ccc331d1bf0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -53,8 +53,6 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off), // We don't need a regPlusES because external symbols never have constant // offsets folded into them, so we can just use add. -let Defs = [ARGUMENTS] in { - // Defines atomic and non-atomic loads, regular and extending. multiclass WebAssemblyLoad { let mayLoad = 1 in @@ -73,8 +71,6 @@ defm LOAD_I64 : WebAssemblyLoad; defm LOAD_F32 : WebAssemblyLoad; defm LOAD_F64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] - // Select loads with no constant offset. class LoadPatNoOffset : Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>; @@ -144,8 +140,6 @@ def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; -let Defs = [ARGUMENTS] in { - // Extending load. 
defm LOAD8_S_I32 : WebAssemblyLoad; defm LOAD8_U_I32 : WebAssemblyLoad; @@ -158,8 +152,6 @@ defm LOAD16_U_I64 : WebAssemblyLoad; defm LOAD32_S_I64 : WebAssemblyLoad; defm LOAD32_U_I64 : WebAssemblyLoad; -} // Defs = [ARGUMENTS] - // Select extending loads with no constant offset. def : LoadPatNoOffset; def : LoadPatNoOffset; @@ -303,9 +295,6 @@ def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; def : LoadPatExternSymOffOnly; - -let Defs = [ARGUMENTS] in { - // Defines atomic and non-atomic stores, regular and truncating multiclass WebAssemblyStore { let mayStore = 1 in @@ -323,8 +312,6 @@ defm STORE_I64 : WebAssemblyStore; defm STORE_F32 : WebAssemblyStore; defm STORE_F64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] - // Select stores with no constant offset. class StorePatNoOffset : Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>; @@ -389,9 +376,6 @@ def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; - -let Defs = [ARGUMENTS] in { - // Truncating store. defm STORE8_I32 : WebAssemblyStore; defm STORE16_I32 : WebAssemblyStore; @@ -399,8 +383,6 @@ defm STORE8_I64 : WebAssemblyStore; defm STORE16_I64 : WebAssemblyStore; defm STORE32_I64 : WebAssemblyStore; -} // Defs = [ARGUMENTS] - // Select truncating stores with no constant offset. def : StorePatNoOffset; def : StorePatNoOffset; @@ -448,8 +430,6 @@ def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; def : StorePatExternSymOffOnly; -let Defs = [ARGUMENTS] in { - // Current memory size. 
defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags), (outs), (ins i32imm:$flags), @@ -493,8 +473,6 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta), 0x40>, Requires<[HasAddr32]>; -} // Defs = [ARGUMENTS] - def : Pat<(int_wasm_current_memory), (CURRENT_MEMORY_I32 0)>; def : Pat<(int_wasm_grow_memory I32:$delta), diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 57024616f3f..b575a039ae0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -22,7 +22,8 @@ multiclass SIMD_I { - let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in + let hasSideEffects = 1, isCodeGenOnly = 1, + Defs = [], Uses = [ARGUMENTS] in defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno), [(set (vt V128:$res), @@ -55,7 +56,6 @@ multiclass ConstVec { "v128.const\t"#args, 0>; } -let Defs = [ARGUMENTS] in { defm "" : ConstVec; -} // Defs = [ARGUMENTS] // Create vector with identical lanes: splat def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>; -- GitLab From efad4789d86a5214fb35a5f07938ed9d54c8b3e1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 11 Oct 2018 20:36:06 +0000 Subject: [PATCH 0073/1116] [X86] Type legalize v2f32 loads by using an f64 load and a scalar_to_vector. On 64-bit targets the generic legalize will use an i64 load and a scalar_to_vector for us. But on 32-bit targets i64 isn't legal and the generic legalizer will end up emitting two 32-bit loads. We have DAG combines that try to put those two loads back together with pretty good success. This patch instead uses f64 to avoid the splitting entirely. I've made it do the same for 64-bit mode for consistency and to keep the load in the fp domain. There are a few things in here that look like regressions in 32-bit mode, but I believe they bring us closer to the 64-bit mode codegen. 
And that the 64-bit mode code could be better. I think those issues should be looked at separately. Differential Revision: https://reviews.llvm.org/D52528 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344291 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++++++++ test/CodeGen/X86/bitcast-int-to-vector.ll | 6 ++-- test/CodeGen/X86/fold-load-vec.ll | 4 +-- .../X86/merge-consecutive-loads-256.ll | 30 ++++++++++--------- test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 24 +++++---------- test/CodeGen/X86/vec_extract-avx.ll | 4 ++- test/CodeGen/X86/vector-shuffle-128-v4.ll | 4 +-- test/CodeGen/X86/widen_load-1.ll | 4 +-- 8 files changed, 61 insertions(+), 39 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 67f98d8ee72..d118e38ae72 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -902,6 +902,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); + // We want to legalize this to an f64 load rather than an i64 load on + // 64-bit targets and two 32-bit loads on a 32-bit target. + setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); @@ -26420,6 +26424,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } break; } + case ISD::LOAD: { + // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids + // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast + // since type legalization will try to use an i64 load. 
+ EVT VT = N->getValueType(0); + assert(VT == MVT::v2f32 && "Unexpected VT"); + if (!ISD::isNON_EXTLoad(N)) + return; + auto *Ld = cast(N); + SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), + Ld->getAlignment(), + Ld->getMemOperand()->getFlags()); + SDValue Chain = Res.getValue(1); + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res); + Res = DAG.getBitcast(MVT::v4f32, Res); + Results.push_back(Res); + Results.push_back(Chain); + return; + } } } diff --git a/test/CodeGen/X86/bitcast-int-to-vector.ll b/test/CodeGen/X86/bitcast-int-to-vector.ll index 1a04fef9e01..e319255e8f0 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector.ll @@ -17,8 +17,10 @@ define i1 @foo(i64 %a) { ; ; X86-SSE-LABEL: foo: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: ucomiss {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movaps %xmm0, %xmm1 +; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] +; X86-SSE-NEXT: ucomiss %xmm1, %xmm0 ; X86-SSE-NEXT: setp %al ; X86-SSE-NEXT: retl ; diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll index 5523846dd19..115f2bf7a5b 100644 --- a/test/CodeGen/X86/fold-load-vec.ll +++ b/test/CodeGen/X86/fold-load-vec.ll @@ -16,8 +16,8 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind { ; CHECK-NEXT: movlps %xmm0, (%rsp) ; CHECK-NEXT: movlps %xmm0, (%rsi) ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-NEXT: callq ext ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll index 
f421d41f886..2feb9742c60 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -237,33 +237,35 @@ define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp { define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp { ; AVX1-LABEL: merge_8f32_2f32_23z5: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovups 16(%rdi), %xmm1 -; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovups 16(%rdi), %xmm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: merge_8f32_2f32_23z5: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovupd 16(%rdi), %xmm0 +; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: merge_8f32_2f32_23z5: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX512F-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovupd 16(%rdi), %xmm0 +; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; X32-AVX-LABEL: merge_8f32_2f32_23z5: ; X32-AVX: # %bb.0: ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; 
X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7] +; X32-AVX-NEXT: vmovups 16(%eax), %xmm0 +; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-AVX-NEXT: retl %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2 %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3 diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 90e31eb5fb3..47649a54e80 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1329,19 +1329,15 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) { ; X86-AVX1-LABEL: test_mm_loadh_pi: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08] -; X86-AVX1-NEXT: # xmm1 = mem[0],zero -; X86-AVX1-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1] -; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X86-AVX1-NEXT: vmovhpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x00] +; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_loadh_pi: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08] -; X86-AVX512-NEXT: # xmm1 = mem[0],zero -; X86-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] -; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X86-AVX512-NEXT: vmovhpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x00] +; X86-AVX512-NEXT: # xmm0 = xmm0[0],mem[0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadh_pi: @@ -1396,19 +1392,15 @@ define <4 
x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) { ; X86-AVX1-LABEL: test_mm_loadl_pi: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08] -; X86-AVX1-NEXT: # xmm1 = mem[0],zero -; X86-AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03] -; X86-AVX1-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3] +; X86-AVX1-NEXT: vmovlpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x00] +; X86-AVX1-NEXT: # xmm0 = mem[0],xmm0[1] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_loadl_pi: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08] -; X86-AVX512-NEXT: # xmm1 = mem[0],zero -; X86-AVX512-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03] -; X86-AVX512-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3] +; X86-AVX512-NEXT: vmovlpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x00] +; X86-AVX512-NEXT: # xmm0 = mem[0],xmm0[1] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadl_pi: diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll index 9a12d69b46b..a15424a763e 100644 --- a/test/CodeGen/X86/vec_extract-avx.ll +++ b/test/CodeGen/X86/vec_extract-avx.ll @@ -171,7 +171,9 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) { ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X32-NEXT: vmovaps %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl diff --git 
a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index e35f664e121..0e4d5dcd386 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1998,8 +1998,8 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) { define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) { ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] ; SSE2-NEXT: retq ; ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32: diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll index 2e4acb57ee4..8cbf8c4e346 100644 --- a/test/CodeGen/X86/widen_load-1.ll +++ b/test/CodeGen/X86/widen_load-1.ll @@ -5,11 +5,11 @@ ; This load should be before the call, not after. -; SSE: movaps compl+128(%rip), %xmm0 +; SSE: movsd compl+128(%rip), %xmm0 ; SSE: movaps %xmm0, (%rsp) ; SSE: callq killcommon -; AVX: vmovaps compl+128(%rip), %xmm0 +; AVX: vmovsd compl+128(%rip), %xmm0 ; AVX: vmovaps %xmm0, (%rsp) ; AVX: callq killcommon -- GitLab From 7034ff81096837cc8e955a692205dc3042735393 Mon Sep 17 00:00:00 2001 From: Aaron Smith Date: Thu, 11 Oct 2018 21:37:18 +0000 Subject: [PATCH 0074/1116] [llvm-pdbutil] Pretty print PDBSymbolUsingNamespace symbols Reviewers: rnk, zturner, llvm-commits Differential Revision: https://reviews.llvm.org/D52799 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344298 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp | 11 +++++++++++ test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb | 0 test/tools/llvm-pdbdump/usingnamespace.test | 6 ++++++ tools/llvm-pdbutil/PrettyCompilandDumper.cpp | 11 +++++++++++ tools/llvm-pdbutil/PrettyCompilandDumper.h | 1 + 5 files changed, 29 insertions(+) create mode 100644 
test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp create mode 100644 test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb create mode 100644 test/tools/llvm-pdbdump/usingnamespace.test diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp new file mode 100644 index 00000000000..403ada17713 --- /dev/null +++ b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp @@ -0,0 +1,11 @@ +// Compile with "cl /c /Zi /GR- UsingNamespaceTest.cpp" +// Link with "link UsingNamespaceTest.obj /debug /nodefaultlib /entry:main" + +namespace NS { + int foo() { return 1; } +} + +using namespace NS; +int main(int argc, char **argv) { + return foo(); +} diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/tools/llvm-pdbdump/usingnamespace.test b/test/tools/llvm-pdbdump/usingnamespace.test new file mode 100644 index 00000000000..954ec114010 --- /dev/null +++ b/test/tools/llvm-pdbdump/usingnamespace.test @@ -0,0 +1,6 @@ +; RUN: llvm-pdbutil pretty -module-syms %p/Inputs/UsingNamespaceTest.pdb > %t +; RUN: FileCheck -input-file=%t %s + +; CHECK: ---SYMBOLS--- +; CHECK-NEXT: {{.*}}UsingNamespaceTest.obj +; CHECK-DAG: using namespace NS diff --git a/tools/llvm-pdbutil/PrettyCompilandDumper.cpp b/tools/llvm-pdbutil/PrettyCompilandDumper.cpp index 0d99c9b1245..94a0b2d5e78 100644 --- a/tools/llvm-pdbutil/PrettyCompilandDumper.cpp +++ b/tools/llvm-pdbutil/PrettyCompilandDumper.cpp @@ -28,6 +28,7 @@ #include "llvm/DebugInfo/PDB/PDBSymbolThunk.h" #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h" #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h" +#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h" #include "llvm/Support/Format.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" @@ -216,3 +217,13 @@ void CompilandDumper::dump(const PDBSymbolUnknown &Symbol) { 
Printer.NewLine(); Printer << "unknown (" << Symbol.getSymTag() << ")"; } + +void CompilandDumper::dump(const PDBSymbolUsingNamespace &Symbol) { + if (Printer.IsSymbolExcluded(Symbol.getName())) + return; + + Printer.NewLine(); + Printer << "using namespace "; + std::string Name = Symbol.getName(); + WithColor(Printer, PDB_ColorItem::Identifier).get() << Name; +} diff --git a/tools/llvm-pdbutil/PrettyCompilandDumper.h b/tools/llvm-pdbutil/PrettyCompilandDumper.h index cae196e9d13..1a840e49607 100644 --- a/tools/llvm-pdbutil/PrettyCompilandDumper.h +++ b/tools/llvm-pdbutil/PrettyCompilandDumper.h @@ -34,6 +34,7 @@ public: void dump(const PDBSymbolThunk &Symbol) override; void dump(const PDBSymbolTypeTypedef &Symbol) override; void dump(const PDBSymbolUnknown &Symbol) override; + void dump(const PDBSymbolUsingNamespace &Symbol) override; private: LinePrinter &Printer; -- GitLab From b4c8a95abbbc8ccdf4079426ddc22b8ba991ea2b Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 11 Oct 2018 21:44:38 +0000 Subject: [PATCH 0075/1116] [x86] regenerate CHECKs; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344301 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/extract-insert.ll | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll index 823390e86d1..a6cac874c41 100644 --- a/test/CodeGen/X86/extract-insert.ll +++ b/test/CodeGen/X86/extract-insert.ll @@ -3,13 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64 define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) { -; X86-LABEL: extractelt_undef_insertelt: -; X86: # %bb.0: -; X86-NEXT: retl -; -; X64-LABEL: extractelt_undef_insertelt: -; X64: # %bb.0: -; X64-NEXT: retq +; CHECK-LABEL: extractelt_undef_insertelt: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} %b = insertelement <4 x i32> zeroinitializer, i32 %x, i64 3 %c = icmp uge i32 
%y, %y %d = extractelement <4 x i32> %b, i1 %c -- GitLab From 9b3effed99daf83effa68df8706fb12a15e9a65d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 11 Oct 2018 22:04:36 +0000 Subject: [PATCH 0076/1116] [x86] add tests for extract_element; NFC The transform for this pattern has an unnecessary one-use limitation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344303 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/extract-insert.ll | 44 ++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll index a6cac874c41..de8ee704b88 100644 --- a/test/CodeGen/X86/extract-insert.ll +++ b/test/CodeGen/X86/extract-insert.ll @@ -12,3 +12,47 @@ define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) { ret i32 %d } +define i8 @extractelt_bitcast(i32 %x) nounwind { +; X86-LABEL: extractelt_bitcast: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: retl +; +; X64-LABEL: extractelt_bitcast: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %bc = bitcast i32 %x to <4 x i8> + %ext = extractelement <4 x i8> %bc, i32 0 + ret i8 %ext +} + +define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind { +; X86-LABEL: extractelt_bitcast_extra_use: +; X86: # %bb.0: +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: popl %ecx +; X86-NEXT: retl +; +; X64-LABEL: extractelt_bitcast_extra_use: +; X64: # %bb.0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-NEXT: movl %edi, (%rsi) +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: # kill: def $al 
killed $al killed $eax +; X64-NEXT: retq + %bc = bitcast i32 %x to <4 x i8> + store <4 x i8> %bc, <4 x i8>* %p + %ext = extractelement <4 x i8> %bc, i32 0 + ret i8 %ext +} + -- GitLab From d7ed5b6ae7ec07fd6ead0b495a2c73e7d34728e0 Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Thu, 11 Oct 2018 22:14:27 +0000 Subject: [PATCH 0077/1116] [SampleFDO][NFC] Remove debugging log left over in the code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344304 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ProfileData/SampleProf.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h index e632a1c955b..927dfd24687 100644 --- a/include/llvm/ProfileData/SampleProf.h +++ b/include/llvm/ProfileData/SampleProf.h @@ -488,8 +488,6 @@ public: // If the format is SPF_Compact_Binary, the name is already a GUID and we // don't want to return the GUID of GUID. static uint64_t getGUID(StringRef Name) { - if (Format == SPF_Compact_Binary) - errs() << Name << '\n'; return (Format == SPF_Compact_Binary) ? 
std::stoull(Name.data()) : Function::getGUID(Name); } -- GitLab From 744c960d1283eb9afea0ece9cd8653b382592171 Mon Sep 17 00:00:00 2001 From: Aaron Smith Date: Thu, 11 Oct 2018 22:25:55 +0000 Subject: [PATCH 0078/1116] [llvm-pdbutil] Add missing pdb for test git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344306 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm-pdbdump/Inputs/UsingNamespaceTest.pdb | Bin 0 -> 102400 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ce5211e3fc8dca5ced3ac638943aac0aff735046 100644 GIT binary patch literal 102400 zcmeaxOfJeV&QB{*aMpL$)>iNhc2h9dGce%gl5z=VU|?VnU|?WkU|^7BU|?tf@fjEx z#G&*kH5vk=Aut*OqaiRF0;3@?8UmvsFd71*Aut*OqaiRF0;3@?G(zCOpZ}v^Gz3ON zU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhnjhkyk$!!Y;ZsP{%gU^E0qLtr!n zMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1h`KbGFpI^DhNQ<`h%FDP+?(UVEF%^8N>w- z5P%pUaTv|Uz$gS_^D%G;lz0Ct&|_d0ViE<}k(XGKS(eK1`O`-bpOGOsKbHZ*W@5-q%u7s9O=0--v50|Pl$n8p z31l9t00TpOd|7gQVo6C+W>RTMYO$47YFTPtNqljBX;E^jBTU@Ns-&_YH9nO=&)zZI z1j2T}s?q_?KnDkgk5X_q78j(ZrZ7OK=9y+>>8Wl9`)X1$PsI zPrSRb7+6Ia7#cvq3krECM)LrU5CMfmD6t{%`O`;GoN@{n{B*z6xKTM1a+#s3(n;as22{EtO=1{jWrf8?aG+ulC(0#|G%JjjW`&T_EJ#iS6xU3Q45A>Kfq}se zng&7YK%o?$%J5N8l$k+HfPn#vOBg_D4Wt&9)NOAX^6Y3j+&O&Ly!V5y_1V9H~el0yRef;qQRNqQunHm@xQXTX1^D1-mOOe8#rGbHgks&ptvLH1jH4Wl0kU1c6 zCWh3|yv+POh$zGykT?rNs%u_pE<_mWKae;FL#j(^T4r8qNMbrf5}`+cAvL(9s5H3* zA_0ywkRAnwRG<9h#2kn)LcIe+YCutbL26M6Gyx&m)4-7G3aV|<^)N7|hNR{e$j;O$d|q>>WcY{{yQ^Y;FX#ONnUbV>J()EU?O8u?gDF*F{SkI!I{)B&P?84<<$i zeGrY5HV|@{X~O`THVl!{2FP4k8w->!j06}MusRjmrU$8qwX;BB0!kakP&eR68`zBl zB@e7J*uw-=77)?rEJ{sI%`8hTf~ROay-p0ZSZw~7M_gF|=>}n#2#$Fia;UbE+yCcc zU=;&}yI(NFM?MBdG0-^yi~p^1*Ij$3}sM#&Q>u2 zMfvGPiMa}HnK`M&3K|s_W|}dGqZGn2i%S!86oN}iQ!?{oj0_CTW1RDIb4&9wODbc+ zoMS@3R`>>oImZ~9=oy;o8JU?InZ+b!=EZpA7neksnZ#6>nZzXL=%rSqGJI5m+N&26 
z3aZTf5_40F3lfu4LsE-N^pXn-K;i2IRYRhmV#<;YO$_u3Qj!=xeUyWmp_{Cr8mH=5pz9l=pc@jPpc?@m3lU>w0QK}iy?jvb9n^CN^~OJa1f_*hJQ@O{Aut*OqaiRF z0;3@?8UmvsFd71*Aut*OqaiSOLtu)YA)`0f#%R!-?!TL}mMi9;blCh(?8v3ryg@Km zgXBOM#s|@N0*%pyK*FPVGz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhl_ zh5%@tzaImmAjoN?&K+Z!jm}TX0<9yn5oAnYV7R~_%)rl(nZW?35mV$EL7EEwkah5x zdFcwy`MCv|IjKbqLJTT`Xsg&6#28crF&52&)}S+}2qu=~Ft9PG2o{&5fN8&A(7plz zuzysbBNS{P&oMACu!7bE2rw`xfM^B=zhEmX&~gO^IR?loUZ^r;$_Zo;69XHA76Zi0 zIZ!h}cOdXGFff2NL4ejdK-MZimOFsV(?B)_!3LWr#2_dLQV-h4;Ln&?l%5PSM^F&7 z4+FGM0xVVrx_5z%fsMgO5VR)!69dHEP`5L{`~-522-JL528ITZI*{2Q43Y!66C@2< zXAfFq0b+A9Ff@SN&c(pc0CGP!14F|X1_lNR1_p)%(DEI|849Yp9iLpb-bmXN{yEgb z4zyODfx(1i-Kt z`~O-BKACyhsYMD}pgy#XAUgx74-fMaAG9xz+K2b?^z#O(kYltJWJoSgVIXl3fdRC_ zon8y-b29U?!At5v>)lCoD7LltAY+nqQy3KVQ&N*k(-rjd@>5dN5=(PRax#+?^i%Un ziYl$ZJ0KW@7<2?dBM_i52uR#XJi1&sO08>_S>5YGJ*6NFfiO;VPJ4zU~teYN=Yn91j(5Tx-de-K>C;% zU@{C0at!8zJy1E28kn3)RaI3=QaS_3P7WrB-V02iF$uj2h^Y{BLFR$%0@=yLpqGk} zGZSP6xyONl!2)W&UI9cFWF|}uq@S4qCId3lOppbnF90&;;K0D(q*s)hlb;MyIf_R^ zU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!n$PEEde-t#QY{S670GfXWo$&{vLG#NX zIncZ@XpDed^GEfKhQMeDjE2By2#kinXb6mkz-S1JhQMeDjE2By2#kinXb22^2yhI% z6GmMy8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFd71*Aut*Oqai@!5CE?rXJB9ett$te z;Sbr+j%uj_149CA#W-l4Hv?z^J7|Fb_yz!kTF^Rh*h+5D`fku7bOz83butJwAQo61 zbWJyS{2_n=w6Gks7#$=H$Dp;}aLy=WGz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E0q zLtr!nMniz?5CE z#HbraLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1jr5nIR-vK&=ftWkIu+opO&BR z;1}%R5b5X<5d|_-2r3Vn@@Iq`Z_gmcpb9z`pWzC4i~xR&K7$ZcT!WE;0j3sY20je3 z7oR+)9C*b5rtql9fQ0~TYycFO2B0$wL4!#GpfvFR|A0Aa)Z)<)7!85Z5Eu=C(GVC7 zfzc2c4S~@R7!85Z5Eu;sln~Hif}FDs8Up~G_XoN&4o0J|4NwI4_d)$S2u3l8Ob)2e zNSfBs{y%BqMSInw{eRkr@u&@>Aut*OqaiRF0;3@?8UmvsFd71*Aut*Oqai?P2!Q(d z<(!6$png9)1AiX}LuLji0|SE$BLl-41_p)<(AfY;mVmbWr+~)+Qrcj`Aibb|ygK+E zy8zG?@(g`FM?lIL7#R2%R)7>RFfcTL^}%#OXe!*om!FhH>aBy=F7pE!19217xeWYZ z|1c!L_z+nhhBctU5(o!{1gS&5Ls55^-bauaND(%(KytXk4Q2)CE(Xwf2_Ur~b3hm* z2fC{-o&j_(pMe9q+!96x1`!-`pga6PBPb3G1`Z&zI*db~gJ@O;Rt8?ML5vKbGyxI= 
zoe9AS7Gr{ofzE$`h%v*(KxaKb#8}{BAUirMU$$nRx}JCB-qN#hH2Od5O8H#RZAU zsUXFA$pr-r4FBH$dk-?%17s`}5*6rbM^HF}j02eq!XWoUFgMtr5H?7~C>{-g(GVC7 zfzc2c4S~@R7!85Z5Eu=C(GVC7fx#aFuzu5WkW)$Rj1?56rlh82=B1`6Bx`FcB$kvE zWhRxDq!ue^B8ni~bF#Z<@0Uf3sUn^ zQu9g}KISp7in1{XG1)=g2?`<(hWPlhWO1Ew!>^HWN5QsDt*Wd#bK_*4cxd&h7S2-^Wo4HjQ~ zjALLGWngFkB@s|?K{1+lt*p>Y0@)IZCWggyXj*iFrbR9WMu7tGAP*#sx`4zP7#W-x z7==J+qzgZLl}OOqhGoI&x=#LwUcqLI=y$b3w7 zj0_%7b(nIP>DnEdo;{(omjDAp5+fvCgUpA7JS1Iv3otOC1rj(tgVZ6XXOR0j7(fdl zAn6;#7hot$Eh^5;&tv!mPV*o!35L?r%oKMNmEg z>Cs@wNX-E)%E-)12bB#VagbgEByo_g)QS?Y+dz6OkmNuq8>9y$?f@1~%gjkdxWNM~ z1~UQTR*+kK7&0U7IA%Wtmc7xI9O${ zgaEWZ&I#&wvoONPd)Nd(<2I0SA&?vwsE@_O$iNMv!DBk;a)^Gh5OlnU2ihOz6<}bf zgO2ln%!T!bLE}Cg0t^g@@B#IanHc!sV?St74ej@W%s}odgUn!on*!;Fg8G&m3@NF} z`9+B(sSF=M;mOV*#KaGEKPWx(VD=TUh8n5i=>$(xsZRO%IjMQ1Fv`jbi;dtw#Uf6WOCV!K!f0t$2r12i9~7?C%RodPQl8$*B;D&ycZ6Hb|O8l7NVF zFr@mX<|d^UK~#g&uK-A_BqKisNlXDrzi%o+5>$>lpvWN%1IaZoz}$&!4a9B+MuZ%~ zM5r8ds(Vg;QeqCWWl+B`ut4Qp5=#=1+{nO@iWDMHa|FP75$x}P#G=I9)DjfGG8m*n zY(SF)xt+lw)ziht7d+Ssu?Au$Qv*Xos$*V$US)26DN^{eG=R?2Ne!ti04=A3=m41m z5@%va4b98U&x43U%mInBFr>QXmF7Z(q5cDjb1x__jH3KJc!0s(@W0+3oVGn(d?1ea z*u}suDhDkSC5bE(OF=mUsW=4lVMF3jjsuv6DDNSVn z`4?37$wSQ|zU%|rgfJO2qk<|#lz+XTbt)r6VnJe2NiJw|2b7MO7(g2abKq@DW`?w) z)Kp0Q%EFMBT3S++2panasbytI$}h=a_*llkF3Ji`M~c|eQAtKoYGO)eUOL>_R#tE^ zSmHqvaDcPn8n9HKACrhnGnj^f6A7j;HakG+LK!VxC?VC0AUPFK95OL7sDfyuG=q@C zY|E)Z+j8mx3=E+54XEt~G8gWu_*4ddQ6>fr0R{%R7(6r}=?SDBmYzUi!NMTKqzQEc zj`W1ZC{O~z6vh@7Nd1r0veY7I{g{}OlV1+*Z-CMS6GLK7d17TTs11hP|Io&kE)vtz zi&E1QOA?a^&9Y%tiOr3mb}12aZCK3%Ckw1HSZso}^L5eEh7MBN0Lkfr;)98iK_5gT zr457}X4(Lq2@dK>8X~0)kh!on7ARd92{15Vbt<$?4^j_nXMw^5ls1f^ZorW?up0+T z9$00thY6@GAfnHSF$s-zZWYpdz)*|DX7Ic;5%G@0L~zXGkVCa?ik=~(H`m5!&>a51 zo3oZH=AU%f{7&r1rP;heN(>AP`aYR?*`9f6`3(A?IkjR2{oc6oc!c$ z@Qghhc!n0#BLu|-0|V%K050$>Dgy&Z4kQXXRS-nq2{dLL?f;MV{}~uY`~Rc;f6#8B z(f&WS(FIUNg+-jG*6^o~qy2xR!3JobakT$G+W#*=8xuwCYFb%=`LHe=lr!4@hmHrJ z_18xG|L_1A?f;MV|E;XB_A9aW1hJ|_?U{jBW`pOYi0B7mH4oei#43ZurqTXC+5%?C 
zdhOxf|3{wx7Z6}z5MW|p;76PP2OSg#!l3zo@cC-so;@q*C_o0kV1|!;42)u+xqn6h z28Oi!d;A0V1XV z6$8!DGBQBKw4h>3!FLNn#7v-K@=OTx?4e?y!wF$}eW79xpnA<1AZ{n|m^g+qs6J<_ zn1G`E^rFOE1-Hx`&}j!17G|0;zM08I`NjEZB?@7g#ifZk3c)3%DVg~(Mh1rFG0yq9 zxutoTC6zH@&M_h2eRnYYf zQP2$uP|%HF0Ih`(V`Tu%^n+&kLA`g-%spr}{?kX$$j&Gp4S~@R7!85Z5Eu=C(GVC7 zfzc2c4S~@R7!85Z5E#56(4xZ3@Uo7HL9m0F!J?Fb!Mlr@;r%~mhME3M3`+%>80N|` zGB8v!GBkZ>W>7X_VEEb0%y8O~nc=n;6T>7`28IWVnHi+LnHdyynHbE@GBDf}V`Nyx zz`!8Bh=IZLBQwK&2WE!cZUzR2xl9Zd(##BR8JQTW_A@YOyk=yu@@8h((89#9_$32F zzAiJvPkROi?d6ON0j>~pD`zt^>`7*32w%p?@Z}OC!wVZmhJUXZ7#0RFGf3}YX3*+l zW~g1qz!0~QfuX~Zfx+PjBg6F~Muz9C%nUnrGcdeb#>7x`n~|Z>fsuhZor%Hu6f?u_ z(+ms(o0%DIOEEG0b75joG-YBqe~^)37ZWqXKW+ww7hQ}D|86lc+&jd`u-KQ8A$$@u zLr4}A!;*)dcQ7_?3?F??xYgr8k7cw=_dWuqZ58UmvsFd71*Aut*OqaiRF0;3@? z8UmvsFd71*A%F;hV;Rg0`?8oAqWYK_q-7Wwlw}wgxMdg^1Z5Z)IC}gXn$4UJrzcqpZ;o7!85Z5Eu=C(GVC7fzc2c4S~@R z7!85Z5Eu=C(GVC7fx#96pd%DC7#JAJ7#J9AL>U;Im>C#6peMVZW?*2@VParN0AZ-Z z7cej|Y+zzw0G$J9!o7yYq8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFd71*Aut*OqhK@yMnhmU m1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU^E1190C9z*kWG* literal 0 HcmV?d00001 -- GitLab From bd934ffaabd8b706cf6d7bc52bd67bab81f7e5d5 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Thu, 11 Oct 2018 22:33:50 +0000 Subject: [PATCH 0079/1116] [llvm-objcopy] Factor out CopyConfig In this diff we move out CopyConfig from llvm-oobjcopy.cpp into a separate header CopyConfig.h to enable us (in the future) reuse this class in the other implementations of objcopy (for coff, mach-o). Additionally this enables us to unload the complexity from llvm-objcopy.cpp a little bit. 
Test plan: make check-all Differential revision: https://reviews.llvm.org/D53006 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344307 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objcopy/CMakeLists.txt | 1 + tools/llvm-objcopy/CopyConfig.cpp | 424 +++++++++++++++++++++++++ tools/llvm-objcopy/CopyConfig.h | 113 +++++++ tools/llvm-objcopy/Object.h | 10 +- tools/llvm-objcopy/llvm-objcopy.cpp | 464 +--------------------------- 5 files changed, 541 insertions(+), 471 deletions(-) create mode 100644 tools/llvm-objcopy/CopyConfig.cpp create mode 100644 tools/llvm-objcopy/CopyConfig.h diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt index b0cd66be5b3..8d963e56758 100644 --- a/tools/llvm-objcopy/CMakeLists.txt +++ b/tools/llvm-objcopy/CMakeLists.txt @@ -14,6 +14,7 @@ tablegen(LLVM StripOpts.inc -gen-opt-parser-defs) add_public_tablegen_target(StripOptsTableGen) add_llvm_tool(llvm-objcopy + CopyConfig.cpp llvm-objcopy.cpp Object.cpp DEPENDS diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp new file mode 100644 index 00000000000..d814df10525 --- /dev/null +++ b/tools/llvm-objcopy/CopyConfig.cpp @@ -0,0 +1,424 @@ +//===- CopyConfig.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "CopyConfig.h" +#include "llvm-objcopy.h" + +#include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compression.h" +#include "llvm/Support/MemoryBuffer.h" +#include +#include + +namespace llvm { +namespace objcopy { + +namespace { +enum ObjcopyID { + OBJCOPY_INVALID = 0, // This is not an option ID. +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OBJCOPY_##ID, +#include "ObjcopyOpts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE; +#include "ObjcopyOpts.inc" +#undef PREFIX + +static const opt::OptTable::Info ObjcopyInfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {OBJCOPY_##PREFIX, \ + NAME, \ + HELPTEXT, \ + METAVAR, \ + OBJCOPY_##ID, \ + opt::Option::KIND##Class, \ + PARAM, \ + FLAGS, \ + OBJCOPY_##GROUP, \ + OBJCOPY_##ALIAS, \ + ALIASARGS, \ + VALUES}, +#include "ObjcopyOpts.inc" +#undef OPTION +}; + +class ObjcopyOptTable : public opt::OptTable { +public: + ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {} +}; + +enum StripID { + STRIP_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + STRIP_##ID, +#include "StripOpts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE; +#include "StripOpts.inc" +#undef PREFIX + +static const opt::OptTable::Info StripInfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {STRIP_##PREFIX, NAME, HELPTEXT, \ + METAVAR, STRIP_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, STRIP_##GROUP, \ + STRIP_##ALIAS, ALIASARGS, VALUES}, +#include "StripOpts.inc" +#undef OPTION +}; + +class StripOptTable : public opt::OptTable { +public: + StripOptTable() : OptTable(StripInfoTable, true) {} +}; + +enum SectionFlag { + SecNone = 0, + SecAlloc = 1 << 0, + SecLoad = 1 << 1, + SecNoload = 1 << 2, + SecReadonly = 1 << 3, + SecDebug = 1 << 4, + SecCode = 1 << 5, + SecData = 1 << 6, + SecRom = 1 << 7, + SecMerge = 1 << 8, + SecStrings = 1 << 9, + SecContents = 1 << 10, + SecShare = 1 << 11, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SecShare) +}; + +} // namespace + +static SectionFlag parseSectionRenameFlag(StringRef SectionName) { + return llvm::StringSwitch(SectionName) + .Case("alloc", SectionFlag::SecAlloc) + .Case("load", SectionFlag::SecLoad) + .Case("noload", SectionFlag::SecNoload) + .Case("readonly", SectionFlag::SecReadonly) + .Case("debug", SectionFlag::SecDebug) + .Case("code", SectionFlag::SecCode) + .Case("data", SectionFlag::SecData) + .Case("rom", SectionFlag::SecRom) + .Case("merge", SectionFlag::SecMerge) + .Case("strings", SectionFlag::SecStrings) + .Case("contents", SectionFlag::SecContents) + .Case("share", SectionFlag::SecShare) + .Default(SectionFlag::SecNone); +} + +static SectionRename parseRenameSectionValue(StringRef FlagValue) { + if (!FlagValue.contains('=')) + error("Bad format for --rename-section: missing '='"); + + // Initial split: ".foo" = ".bar,f1,f2,..." 
+ auto Old2New = FlagValue.split('='); + SectionRename SR; + SR.OriginalName = Old2New.first; + + // Flags split: ".bar" "f1" "f2" ... + SmallVector NameAndFlags; + Old2New.second.split(NameAndFlags, ','); + SR.NewName = NameAndFlags[0]; + + if (NameAndFlags.size() > 1) { + SectionFlag Flags = SectionFlag::SecNone; + for (size_t I = 1, Size = NameAndFlags.size(); I < Size; ++I) { + SectionFlag Flag = parseSectionRenameFlag(NameAndFlags[I]); + if (Flag == SectionFlag::SecNone) + error("Unrecognized section flag '" + NameAndFlags[I] + + "'. Flags supported for GNU compatibility: alloc, load, noload, " + "readonly, debug, code, data, rom, share, contents, merge, " + "strings."); + Flags |= Flag; + } + + SR.NewFlags = 0; + if (Flags & SectionFlag::SecAlloc) + *SR.NewFlags |= ELF::SHF_ALLOC; + if (!(Flags & SectionFlag::SecReadonly)) + *SR.NewFlags |= ELF::SHF_WRITE; + if (Flags & SectionFlag::SecCode) + *SR.NewFlags |= ELF::SHF_EXECINSTR; + if (Flags & SectionFlag::SecMerge) + *SR.NewFlags |= ELF::SHF_MERGE; + if (Flags & SectionFlag::SecStrings) + *SR.NewFlags |= ELF::SHF_STRINGS; + } + + return SR; +} + +static const StringMap ArchMap{ + // Name, {EMachine, 64bit, LittleEndian} + {"aarch64", {ELF::EM_AARCH64, true, true}}, + {"arm", {ELF::EM_ARM, false, true}}, + {"i386", {ELF::EM_386, false, true}}, + {"i386:x86-64", {ELF::EM_X86_64, true, true}}, + {"powerpc:common64", {ELF::EM_PPC64, true, true}}, + {"sparc", {ELF::EM_SPARC, false, true}}, + {"x86-64", {ELF::EM_X86_64, true, true}}, +}; + +static const MachineInfo &getMachineInfo(StringRef Arch) { + auto Iter = ArchMap.find(Arch); + if (Iter == std::end(ArchMap)) + error("Invalid architecture: '" + Arch + "'"); + return Iter->getValue(); +} + +static void addGlobalSymbolsFromFile(std::vector &Symbols, + StringRef Filename) { + SmallVector Lines; + auto BufOrErr = MemoryBuffer::getFile(Filename); + if (!BufOrErr) + reportError(Filename, BufOrErr.getError()); + + BufOrErr.get()->getBuffer().split(Lines, '\n'); + for 
(StringRef Line : Lines) { + // Ignore everything after '#', trim whitespace, and only add the symbol if + // it's not empty. + auto TrimmedLine = Line.split('#').first.trim(); + if (!TrimmedLine.empty()) + Symbols.push_back(TrimmedLine.str()); + } +} + +// ParseObjcopyOptions returns the config and sets the input arguments. If a +// help flag is set then ParseObjcopyOptions will print the help messege and +// exit. +DriverConfig parseObjcopyOptions(ArrayRef ArgsArr) { + ObjcopyOptTable T; + unsigned MissingArgumentIndex, MissingArgumentCount; + llvm::opt::InputArgList InputArgs = + T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); + + if (InputArgs.size() == 0) { + T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool"); + exit(1); + } + + if (InputArgs.hasArg(OBJCOPY_help)) { + T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool"); + exit(0); + } + + if (InputArgs.hasArg(OBJCOPY_version)) { + cl::PrintVersionMessage(); + exit(0); + } + + SmallVector Positional; + + for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN)) + error("unknown argument '" + Arg->getAsString(InputArgs) + "'"); + + for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT)) + Positional.push_back(Arg->getValue()); + + if (Positional.empty()) + error("No input file specified"); + + if (Positional.size() > 2) + error("Too many positional arguments"); + + CopyConfig Config; + Config.InputFilename = Positional[0]; + Config.OutputFilename = Positional[Positional.size() == 1 ? 
0 : 1]; + Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target); + Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target); + if (Config.InputFormat == "binary") { + auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture); + if (BinaryArch.empty()) + error("Specified binary input without specifiying an architecture"); + Config.BinaryArch = getMachineInfo(BinaryArch); + } + + if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections, + OBJCOPY_compress_debug_sections_eq)) { + Config.CompressionType = DebugCompressionType::Z; + + if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) { + Config.CompressionType = + StringSwitch( + InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq)) + .Case("zlib-gnu", DebugCompressionType::GNU) + .Case("zlib", DebugCompressionType::Z) + .Default(DebugCompressionType::None); + if (Config.CompressionType == DebugCompressionType::None) + error("Invalid or unsupported --compress-debug-sections format: " + + InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq)); + if (!zlib::isAvailable()) + error("LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress."); + } + } + + Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo); + Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink); + Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols); + + for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) { + if (!StringRef(Arg->getValue()).contains('=')) + error("Bad format for --redefine-sym"); + auto Old2New = StringRef(Arg->getValue()).split('='); + if (!Config.SymbolsToRename.insert(Old2New).second) + error("Multiple redefinition of symbol " + Old2New.first); + } + + for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) { + SectionRename SR = parseRenameSectionValue(StringRef(Arg->getValue())); + if (!Config.SectionsToRename.try_emplace(SR.OriginalName, SR).second) + error("Multiple renames 
of section " + SR.OriginalName); + } + + for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section)) + Config.ToRemove.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep)) + Config.Keep.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep)) + Config.OnlyKeep.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_add_section)) + Config.AddSection.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section)) + Config.DumpSection.push_back(Arg->getValue()); + Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all); + Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu); + Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug); + Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo); + Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections); + Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc); + Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded); + Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo); + Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden); + Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken); + Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all); + Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug); + Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols); + Config.DecompressDebugSections = + InputArgs.hasArg(OBJCOPY_decompress_debug_sections); + for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol)) + Config.SymbolsToLocalize.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol)) + Config.SymbolsToKeepGlobal.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols)) + addGlobalSymbolsFromFile(Config.SymbolsToKeepGlobal, Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol)) + Config.SymbolsToGlobalize.push_back(Arg->getValue()); + for (auto Arg : 
InputArgs.filtered(OBJCOPY_weaken_symbol)) + Config.SymbolsToWeaken.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol)) + Config.SymbolsToRemove.push_back(Arg->getValue()); + for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol)) + Config.SymbolsToKeep.push_back(Arg->getValue()); + + Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates); + + DriverConfig DC; + DC.CopyConfigs.push_back(std::move(Config)); + if (Config.DecompressDebugSections && + Config.CompressionType != DebugCompressionType::None) { + error("Cannot specify --compress-debug-sections at the same time as " + "--decompress-debug-sections at the same time"); + } + + if (Config.DecompressDebugSections && !zlib::isAvailable()) + error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress."); + + return DC; +} + +// ParseStripOptions returns the config and sets the input arguments. If a +// help flag is set then ParseStripOptions will print the help messege and +// exit. 
+DriverConfig parseStripOptions(ArrayRef ArgsArr) { + StripOptTable T; + unsigned MissingArgumentIndex, MissingArgumentCount; + llvm::opt::InputArgList InputArgs = + T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); + + if (InputArgs.size() == 0) { + T.PrintHelp(errs(), "llvm-strip [options] file...", "strip tool"); + exit(1); + } + + if (InputArgs.hasArg(STRIP_help)) { + T.PrintHelp(outs(), "llvm-strip [options] file...", "strip tool"); + exit(0); + } + + if (InputArgs.hasArg(STRIP_version)) { + cl::PrintVersionMessage(); + exit(0); + } + + SmallVector Positional; + for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN)) + error("unknown argument '" + Arg->getAsString(InputArgs) + "'"); + for (auto Arg : InputArgs.filtered(STRIP_INPUT)) + Positional.push_back(Arg->getValue()); + + if (Positional.empty()) + error("No input file specified"); + + if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) + error("Multiple input files cannot be used in combination with -o"); + + CopyConfig Config; + Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug); + + Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all); + Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded); + Config.StripAll = InputArgs.hasArg(STRIP_strip_all); + + if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll) + Config.StripAll = true; + + for (auto Arg : InputArgs.filtered(STRIP_remove_section)) + Config.ToRemove.push_back(Arg->getValue()); + + for (auto Arg : InputArgs.filtered(STRIP_keep_symbol)) + Config.SymbolsToKeep.push_back(Arg->getValue()); + + Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates); + + DriverConfig DC; + if (Positional.size() == 1) { + Config.InputFilename = Positional[0]; + Config.OutputFilename = + InputArgs.getLastArgValue(STRIP_output, Positional[0]); + DC.CopyConfigs.push_back(std::move(Config)); + } else { + for (const char *Filename : Positional) { + Config.InputFilename = Filename; + Config.OutputFilename = Filename; 
+ DC.CopyConfigs.push_back(Config); + } + } + + return DC; +} + +} // namespace objcopy +} // namespace llvm diff --git a/tools/llvm-objcopy/CopyConfig.h b/tools/llvm-objcopy/CopyConfig.h new file mode 100644 index 00000000000..203432a11a6 --- /dev/null +++ b/tools/llvm-objcopy/CopyConfig.h @@ -0,0 +1,113 @@ +//===- CopyConfig.h -------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H +#define LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +// Necessary for llvm::DebugCompressionType::None +#include "llvm/Target/TargetOptions.h" +#include +#include + +namespace llvm { +namespace objcopy { + +// This type keeps track of the machine info for various architectures. This +// lets us map architecture names to ELF types and the e_machine value of the +// ELF file. +struct MachineInfo { + uint16_t EMachine; + bool Is64Bit; + bool IsLittleEndian; +}; + +struct SectionRename { + StringRef OriginalName; + StringRef NewName; + Optional NewFlags; +}; + +// Configuration for copying/stripping a single file. 
+struct CopyConfig { + // Main input/output options + StringRef InputFilename; + StringRef InputFormat; + StringRef OutputFilename; + StringRef OutputFormat; + + // Only applicable for --input-format=Binary + MachineInfo BinaryArch; + + // Advanced options + StringRef AddGnuDebugLink; + StringRef SplitDWO; + StringRef SymbolsPrefix; + + // Repeated options + std::vector AddSection; + std::vector DumpSection; + std::vector Keep; + std::vector OnlyKeep; + std::vector SymbolsToGlobalize; + std::vector SymbolsToKeep; + std::vector SymbolsToLocalize; + std::vector SymbolsToRemove; + std::vector SymbolsToWeaken; + std::vector ToRemove; + std::vector SymbolsToKeepGlobal; + + // Map options + StringMap SectionsToRename; + StringMap SymbolsToRename; + + // Boolean options + bool DiscardAll = false; + bool ExtractDWO = false; + bool KeepFileSymbols = false; + bool LocalizeHidden = false; + bool OnlyKeepDebug = false; + bool PreserveDates = false; + bool StripAll = false; + bool StripAllGNU = false; + bool StripDWO = false; + bool StripDebug = false; + bool StripNonAlloc = false; + bool StripSections = false; + bool StripUnneeded = false; + bool Weaken = false; + bool DecompressDebugSections = false; + DebugCompressionType CompressionType = DebugCompressionType::None; +}; + +// Configuration for the overall invocation of this tool. When invoked as +// objcopy, will always contain exactly one CopyConfig. When invoked as strip, +// will contain one or more CopyConfigs. +struct DriverConfig { + SmallVector CopyConfigs; +}; + +// ParseObjcopyOptions returns the config and sets the input arguments. If a +// help flag is set then ParseObjcopyOptions will print the help messege and +// exit. +DriverConfig parseObjcopyOptions(ArrayRef ArgsArr); + +// ParseStripOptions returns the config and sets the input arguments. If a +// help flag is set then ParseStripOptions will print the help messege and +// exit. 
+DriverConfig parseStripOptions(ArrayRef ArgsArr); + +} // namespace objcopy +} // namespace llvm + +#endif diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h index 5fb03a5501e..46c8f1ca4bf 100644 --- a/tools/llvm-objcopy/Object.h +++ b/tools/llvm-objcopy/Object.h @@ -10,6 +10,7 @@ #ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H #define LLVM_TOOLS_OBJCOPY_OBJECT_H +#include "CopyConfig.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -67,15 +68,6 @@ public: enum ElfType { ELFT_ELF32LE, ELFT_ELF64LE, ELFT_ELF32BE, ELFT_ELF64BE }; -// This type keeps track of the machine info for various architectures. This -// lets us map architecture names to ELF types and the e_machine value of the -// ELF file. -struct MachineInfo { - uint16_t EMachine; - bool Is64Bit; - bool IsLittleEndian; -}; - class SectionVisitor { public: virtual ~SectionVisitor(); diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp index 41c6ef3f3dc..c9b170d1d61 100644 --- a/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/tools/llvm-objcopy/llvm-objcopy.cpp @@ -8,8 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm-objcopy.h" - +#include "CopyConfig.h" #include "Object.h" + #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -56,160 +57,8 @@ using namespace llvm::objcopy; using namespace object; using namespace ELF; -namespace { - -enum ObjcopyID { - OBJCOPY_INVALID = 0, // This is not an option ID. 
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - OBJCOPY_##ID, -#include "ObjcopyOpts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE; -#include "ObjcopyOpts.inc" -#undef PREFIX - -static const opt::OptTable::Info ObjcopyInfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - {OBJCOPY_##PREFIX, \ - NAME, \ - HELPTEXT, \ - METAVAR, \ - OBJCOPY_##ID, \ - opt::Option::KIND##Class, \ - PARAM, \ - FLAGS, \ - OBJCOPY_##GROUP, \ - OBJCOPY_##ALIAS, \ - ALIASARGS, \ - VALUES}, -#include "ObjcopyOpts.inc" -#undef OPTION -}; - -class ObjcopyOptTable : public opt::OptTable { -public: - ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {} -}; - -enum StripID { - STRIP_INVALID = 0, // This is not an option ID. -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - STRIP_##ID, -#include "StripOpts.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE; -#include "StripOpts.inc" -#undef PREFIX - -static const opt::OptTable::Info StripInfoTable[] = { -#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ - HELPTEXT, METAVAR, VALUES) \ - {STRIP_##PREFIX, NAME, HELPTEXT, \ - METAVAR, STRIP_##ID, opt::Option::KIND##Class, \ - PARAM, FLAGS, STRIP_##GROUP, \ - STRIP_##ALIAS, ALIASARGS, VALUES}, -#include "StripOpts.inc" -#undef OPTION -}; - -class StripOptTable : public opt::OptTable { -public: - StripOptTable() : OptTable(StripInfoTable, true) {} -}; - -struct SectionRename { - StringRef OriginalName; - StringRef NewName; - Optional NewFlags; -}; - -// Configuration for copying/stripping a single file. 
-struct CopyConfig { - // Main input/output options - StringRef InputFilename; - StringRef InputFormat; - StringRef OutputFilename; - StringRef OutputFormat; - - // Only applicable for --input-format=Binary - MachineInfo BinaryArch; - - // Advanced options - StringRef AddGnuDebugLink; - StringRef SplitDWO; - StringRef SymbolsPrefix; - - // Repeated options - std::vector AddSection; - std::vector DumpSection; - std::vector Keep; - std::vector OnlyKeep; - std::vector SymbolsToGlobalize; - std::vector SymbolsToKeep; - std::vector SymbolsToLocalize; - std::vector SymbolsToRemove; - std::vector SymbolsToWeaken; - std::vector ToRemove; - std::vector SymbolsToKeepGlobal; - - // Map options - StringMap SectionsToRename; - StringMap SymbolsToRename; - - // Boolean options - bool DiscardAll = false; - bool ExtractDWO = false; - bool KeepFileSymbols = false; - bool LocalizeHidden = false; - bool OnlyKeepDebug = false; - bool PreserveDates = false; - bool StripAll = false; - bool StripAllGNU = false; - bool StripDWO = false; - bool StripDebug = false; - bool StripNonAlloc = false; - bool StripSections = false; - bool StripUnneeded = false; - bool Weaken = false; - bool DecompressDebugSections = false; - DebugCompressionType CompressionType = DebugCompressionType::None; -}; - -// Configuration for the overall invocation of this tool. When invoked as -// objcopy, will always contain exactly one CopyConfig. When invoked as strip, -// will contain one or more CopyConfigs. 
-struct DriverConfig { - SmallVector CopyConfigs; -}; - using SectionPred = std::function; -enum SectionFlag { - SecNone = 0, - SecAlloc = 1 << 0, - SecLoad = 1 << 1, - SecNoload = 1 << 2, - SecReadonly = 1 << 3, - SecDebug = 1 << 4, - SecCode = 1 << 5, - SecData = 1 << 6, - SecRom = 1 << 7, - SecMerge = 1 << 8, - SecStrings = 1 << 9, - SecContents = 1 << 10, - SecShare = 1 << 11, - LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SecShare) -}; - -} // namespace - namespace llvm { namespace objcopy { @@ -242,65 +91,6 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) { } // end namespace objcopy } // end namespace llvm -static SectionFlag parseSectionRenameFlag(StringRef SectionName) { - return llvm::StringSwitch(SectionName) - .Case("alloc", SectionFlag::SecAlloc) - .Case("load", SectionFlag::SecLoad) - .Case("noload", SectionFlag::SecNoload) - .Case("readonly", SectionFlag::SecReadonly) - .Case("debug", SectionFlag::SecDebug) - .Case("code", SectionFlag::SecCode) - .Case("data", SectionFlag::SecData) - .Case("rom", SectionFlag::SecRom) - .Case("merge", SectionFlag::SecMerge) - .Case("strings", SectionFlag::SecStrings) - .Case("contents", SectionFlag::SecContents) - .Case("share", SectionFlag::SecShare) - .Default(SectionFlag::SecNone); -} - -static SectionRename parseRenameSectionValue(StringRef FlagValue) { - if (!FlagValue.contains('=')) - error("Bad format for --rename-section: missing '='"); - - // Initial split: ".foo" = ".bar,f1,f2,..." - auto Old2New = FlagValue.split('='); - SectionRename SR; - SR.OriginalName = Old2New.first; - - // Flags split: ".bar" "f1" "f2" ... 
- SmallVector NameAndFlags; - Old2New.second.split(NameAndFlags, ','); - SR.NewName = NameAndFlags[0]; - - if (NameAndFlags.size() > 1) { - SectionFlag Flags = SectionFlag::SecNone; - for (size_t I = 1, Size = NameAndFlags.size(); I < Size; ++I) { - SectionFlag Flag = parseSectionRenameFlag(NameAndFlags[I]); - if (Flag == SectionFlag::SecNone) - error("Unrecognized section flag '" + NameAndFlags[I] + - "'. Flags supported for GNU compatibility: alloc, load, noload, " - "readonly, debug, code, data, rom, share, contents, merge, " - "strings."); - Flags |= Flag; - } - - SR.NewFlags = 0; - if (Flags & SectionFlag::SecAlloc) - *SR.NewFlags |= ELF::SHF_ALLOC; - if (!(Flags & SectionFlag::SecReadonly)) - *SR.NewFlags |= ELF::SHF_WRITE; - if (Flags & SectionFlag::SecCode) - *SR.NewFlags |= ELF::SHF_EXECINSTR; - if (Flags & SectionFlag::SecMerge) - *SR.NewFlags |= ELF::SHF_MERGE; - if (Flags & SectionFlag::SecStrings) - *SR.NewFlags |= ELF::SHF_STRINGS; - } - - return SR; -} - static bool isDebugSection(const SectionBase &Sec) { return StringRef(Sec.Name).startswith(".debug") || StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index"; @@ -319,24 +109,6 @@ static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) { return !isDWOSection(Sec); } -static const StringMap ArchMap{ - // Name, {EMachine, 64bit, LittleEndian} - {"aarch64", {EM_AARCH64, true, true}}, - {"arm", {EM_ARM, false, true}}, - {"i386", {EM_386, false, true}}, - {"i386:x86-64", {EM_X86_64, true, true}}, - {"powerpc:common64", {EM_PPC64, true, true}}, - {"sparc", {EM_SPARC, false, true}}, - {"x86-64", {EM_X86_64, true, true}}, -}; - -static const MachineInfo &getMachineInfo(StringRef Arch) { - auto Iter = ArchMap.find(Arch); - if (Iter == std::end(ArchMap)) - error("Invalid architecture: '" + Arch + "'"); - return Iter->getValue(); -} - static ElfType getOutputElfType(const Binary &Bin) { // Infer output ELF type from the input ELF object if (isa>(Bin)) @@ -870,238 +642,6 @@ static 
void executeElfObjcopy(const CopyConfig &Config) { } } -static void addGlobalSymbolsFromFile(std::vector &Symbols, - StringRef Filename) { - SmallVector Lines; - auto BufOrErr = MemoryBuffer::getFile(Filename); - if (!BufOrErr) - reportError(Filename, BufOrErr.getError()); - - BufOrErr.get()->getBuffer().split(Lines, '\n'); - for (StringRef Line : Lines) { - // Ignore everything after '#', trim whitespace, and only add the symbol if - // it's not empty. - auto TrimmedLine = Line.split('#').first.trim(); - if (!TrimmedLine.empty()) - Symbols.push_back(TrimmedLine.str()); - } -} - -// ParseObjcopyOptions returns the config and sets the input arguments. If a -// help flag is set then ParseObjcopyOptions will print the help messege and -// exit. -static DriverConfig parseObjcopyOptions(ArrayRef ArgsArr) { - ObjcopyOptTable T; - unsigned MissingArgumentIndex, MissingArgumentCount; - llvm::opt::InputArgList InputArgs = - T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); - - if (InputArgs.size() == 0) { - T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool"); - exit(1); - } - - if (InputArgs.hasArg(OBJCOPY_help)) { - T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool"); - exit(0); - } - - if (InputArgs.hasArg(OBJCOPY_version)) { - cl::PrintVersionMessage(); - exit(0); - } - - SmallVector Positional; - - for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN)) - error("unknown argument '" + Arg->getAsString(InputArgs) + "'"); - - for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT)) - Positional.push_back(Arg->getValue()); - - if (Positional.empty()) - error("No input file specified"); - - if (Positional.size() > 2) - error("Too many positional arguments"); - - CopyConfig Config; - Config.InputFilename = Positional[0]; - Config.OutputFilename = Positional[Positional.size() == 1 ? 
0 : 1]; - Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target); - Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target); - if (Config.InputFormat == "binary") { - auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture); - if (BinaryArch.empty()) - error("Specified binary input without specifiying an architecture"); - Config.BinaryArch = getMachineInfo(BinaryArch); - } - - if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections, - OBJCOPY_compress_debug_sections_eq)) { - Config.CompressionType = DebugCompressionType::Z; - - if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) { - Config.CompressionType = - StringSwitch( - InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq)) - .Case("zlib-gnu", DebugCompressionType::GNU) - .Case("zlib", DebugCompressionType::Z) - .Default(DebugCompressionType::None); - if (Config.CompressionType == DebugCompressionType::None) - error("Invalid or unsupported --compress-debug-sections format: " + - InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq)); - if (!zlib::isAvailable()) - error("LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress."); - } - } - - Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo); - Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink); - Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols); - - for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) { - if (!StringRef(Arg->getValue()).contains('=')) - error("Bad format for --redefine-sym"); - auto Old2New = StringRef(Arg->getValue()).split('='); - if (!Config.SymbolsToRename.insert(Old2New).second) - error("Multiple redefinition of symbol " + Old2New.first); - } - - for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) { - SectionRename SR = parseRenameSectionValue(StringRef(Arg->getValue())); - if (!Config.SectionsToRename.try_emplace(SR.OriginalName, SR).second) - error("Multiple renames 
of section " + SR.OriginalName); - } - - for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section)) - Config.ToRemove.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep)) - Config.Keep.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep)) - Config.OnlyKeep.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_add_section)) - Config.AddSection.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section)) - Config.DumpSection.push_back(Arg->getValue()); - Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all); - Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu); - Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug); - Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo); - Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections); - Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc); - Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded); - Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo); - Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden); - Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken); - Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all); - Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug); - Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols); - Config.DecompressDebugSections = - InputArgs.hasArg(OBJCOPY_decompress_debug_sections); - for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol)) - Config.SymbolsToLocalize.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol)) - Config.SymbolsToKeepGlobal.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols)) - addGlobalSymbolsFromFile(Config.SymbolsToKeepGlobal, Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol)) - Config.SymbolsToGlobalize.push_back(Arg->getValue()); - for (auto Arg : 
InputArgs.filtered(OBJCOPY_weaken_symbol)) - Config.SymbolsToWeaken.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol)) - Config.SymbolsToRemove.push_back(Arg->getValue()); - for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol)) - Config.SymbolsToKeep.push_back(Arg->getValue()); - - Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates); - - DriverConfig DC; - DC.CopyConfigs.push_back(std::move(Config)); - if (Config.DecompressDebugSections && - Config.CompressionType != DebugCompressionType::None) { - error("Cannot specify --compress-debug-sections at the same time as " - "--decompress-debug-sections at the same time"); - } - - if (Config.DecompressDebugSections && !zlib::isAvailable()) - error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress."); - - return DC; -} - -// ParseStripOptions returns the config and sets the input arguments. If a -// help flag is set then ParseStripOptions will print the help messege and -// exit. 
-static DriverConfig parseStripOptions(ArrayRef ArgsArr) { - StripOptTable T; - unsigned MissingArgumentIndex, MissingArgumentCount; - llvm::opt::InputArgList InputArgs = - T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount); - - static const char Usage[] = "llvm-strip [options] file..."; - if (InputArgs.size() == 0) { - T.PrintHelp(errs(), Usage, "strip tool"); - exit(1); - } - - if (InputArgs.hasArg(STRIP_help)) { - T.PrintHelp(outs(), Usage, "strip tool"); - exit(0); - } - - if (InputArgs.hasArg(STRIP_version)) { - cl::PrintVersionMessage(); - exit(0); - } - - SmallVector Positional; - for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN)) - error("unknown argument '" + Arg->getAsString(InputArgs) + "'"); - for (auto Arg : InputArgs.filtered(STRIP_INPUT)) - Positional.push_back(Arg->getValue()); - - if (Positional.empty()) - error("No input file specified"); - - if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output)) - error("Multiple input files cannot be used in combination with -o"); - - CopyConfig Config; - Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug); - - Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all); - Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded); - Config.StripAll = InputArgs.hasArg(STRIP_strip_all); - - if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll) - Config.StripAll = true; - - for (auto Arg : InputArgs.filtered(STRIP_remove_section)) - Config.ToRemove.push_back(Arg->getValue()); - - for (auto Arg : InputArgs.filtered(STRIP_keep_symbol)) - Config.SymbolsToKeep.push_back(Arg->getValue()); - - Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates); - - DriverConfig DC; - if (Positional.size() == 1) { - Config.InputFilename = Positional[0]; - Config.OutputFilename = - InputArgs.getLastArgValue(STRIP_output, Positional[0]); - DC.CopyConfigs.push_back(std::move(Config)); - } else { - for (const char *Filename : Positional) { - Config.InputFilename = Filename; - 
Config.OutputFilename = Filename; - DC.CopyConfigs.push_back(Config); - } - } - - return DC; -} - int main(int argc, char **argv) { InitLLVM X(argc, argv); ToolName = argv[0]; -- GitLab From 0739d3ad5470acdf5fdc9cfa036194e9af438fb3 Mon Sep 17 00:00:00 2001 From: Richard Trieu Date: Thu, 11 Oct 2018 22:42:41 +0000 Subject: [PATCH 0080/1116] Inline variable into assert to avoid unused variable warning. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344308 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d118e38ae72..c6ab4fb70f6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26428,8 +26428,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast // since type legalization will try to use an i64 load. - EVT VT = N->getValueType(0); - assert(VT == MVT::v2f32 && "Unexpected VT"); + assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT"); if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); -- GitLab From 81c9e86b06fc9ce882a0ea7f1a8df0d4b49f5cb2 Mon Sep 17 00:00:00 2001 From: Ana Pazos Date: Thu, 11 Oct 2018 22:49:13 +0000 Subject: [PATCH 0081/1116] [RISCV] Fix disassembling of fence instruction with invalid field Summary: Instruction with 0 in fence field being disassembled as fence , iorw. Printing "unknown" to match GAS behavior. This bug was uncovered by a LLVM MC Disassembler Protocol Buffer Fuzzer for the RISC-V assembly language. 
Reviewers: asb Subscribers: rbar, johnrusso, simoncook, sabuasal, niosHD, kito-cheng, shiva0217, zzheng, edward-jones, mgrang, rogfer01, MartinMosbeck, brucehoult, the_o, rkruppe, jfb, PkmX, jocewei, asb Differential Revision: https://reviews.llvm.org/D51828 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344309 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp | 4 ++++ test/MC/Disassembler/RISCV/unknown-fence-field.txt | 9 +++++++++ test/MC/RISCV/rv32i-invalid.s | 1 + 3 files changed, 14 insertions(+) create mode 100644 test/MC/Disassembler/RISCV/unknown-fence-field.txt diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp index aa21cf0e6b4..979c8f4e2fa 100644 --- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp +++ b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp @@ -93,6 +93,8 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned FenceArg = MI->getOperand(OpNo).getImm(); + assert (((FenceArg >> 4) == 0) && "Invalid immediate in printFenceArg"); + if ((FenceArg & RISCVFenceField::I) != 0) O << 'i'; if ((FenceArg & RISCVFenceField::O) != 0) @@ -101,6 +103,8 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo, O << 'r'; if ((FenceArg & RISCVFenceField::W) != 0) O << 'w'; + if (FenceArg == 0) + O << "unknown"; } void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo, diff --git a/test/MC/Disassembler/RISCV/unknown-fence-field.txt b/test/MC/Disassembler/RISCV/unknown-fence-field.txt new file mode 100644 index 00000000000..5b20994dcb6 --- /dev/null +++ b/test/MC/Disassembler/RISCV/unknown-fence-field.txt @@ -0,0 +1,9 @@ +# RUN: llvm-mc -disassemble -triple=riscv32 < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -disassemble -triple=riscv64 < %s 2>&1 | FileCheck %s +# +# Test generated by a LLVM MC Disassembler Protocol Buffer Fuzzer +# for the RISC-V 
assembly language. + +# This decodes as fence , iorw with invalid fence field as 0. +[0x0f 0x00 0xf0 0x00] +# CHECK: fence unknown, iorw diff --git a/test/MC/RISCV/rv32i-invalid.s b/test/MC/RISCV/rv32i-invalid.s index 92b9b4ad34f..f856bf1f934 100644 --- a/test/MC/RISCV/rv32i-invalid.s +++ b/test/MC/RISCV/rv32i-invalid.s @@ -6,6 +6,7 @@ fence iorw, iore # CHECK: :[[@LINE]]:13: error: operand must be formed of letter fence wr, wr # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw' fence rw, rr # CHECK: :[[@LINE]]:11: error: operand must be formed of letters selected in-order from 'iorw' fence 1, rw # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw' +fence unknown, unknown # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw' ## uimm5 slli a0, a0, 32 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31] -- GitLab From 1a0ffaa45417bbc9e684e65ff3fcda884435860f Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 11 Oct 2018 22:49:54 +0000 Subject: [PATCH 0082/1116] AMDGPU/GlobalISel: Implement select for G_INSERT Reviewers: arsenm Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, kristof.beyls, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D53116 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344310 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 30 ++++++++++++ lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 1 + .../AMDGPU/GlobalISel/inst-select-insert.mir | 49 +++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8eb49d49b2e..55ceb8f666f 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ 
b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -178,6 +178,34 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { return true; } +bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); + DebugLoc DL = I.getDebugLoc(); + MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) + .addDef(I.getOperand(0).getReg()) + .addReg(I.getOperand(1).getReg()) + .addReg(I.getOperand(2).getReg()) + .addImm(SubReg); + + for (const MachineOperand &MO : Ins->operands()) { + if (!MO.isReg()) + continue; + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + } + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { unsigned IntrinsicID = I.getOperand(1).getIntrinsicID(); @@ -640,6 +668,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, return selectG_GEP(I); case TargetOpcode::G_IMPLICIT_DEF: return selectG_IMPLICIT_DEF(I); + case TargetOpcode::G_INSERT: + return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: return selectG_INTRINSIC(I, CoverageInfo); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 449431adc56..f3a835a32a8 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -67,6 +67,7 @@ private: bool selectG_ADD(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; + bool 
selectG_INSERT(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir new file mode 100644 index 00000000000..93e35ead4d4 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir @@ -0,0 +1,49 @@ +# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +--- + +name: insert512 +legalized: true +regBankSelected: true + +# CHECK-LABEL: insert512 +# CHECK: [[BASE:%[0-9]+]]:sreg_512 = IMPLICIT_DEF +# CHECK: [[VAL:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF +# CHECK: [[BASE0:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE]], [[VAL]], %subreg.sub0 +# CHECK: [[BASE1:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE0]], [[VAL]], %subreg.sub1 +# CHECK: [[BASE2:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE1]], [[VAL]], %subreg.sub2 +# CHECK: [[BASE3:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE2]], [[VAL]], %subreg.sub3 +# CHECK: [[BASE4:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE3]], [[VAL]], %subreg.sub4 +# CHECK: [[BASE5:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE4]], [[VAL]], %subreg.sub5 +# CHECK: [[BASE6:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE5]], [[VAL]], %subreg.sub6 +# CHECK: [[BASE7:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE6]], [[VAL]], %subreg.sub7 +# CHECK: [[BASE8:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE7]], [[VAL]], %subreg.sub8 +# CHECK: [[BASE9:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE8]], [[VAL]], %subreg.sub9 +# CHECK: [[BASE10:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE9]], [[VAL]], %subreg.sub10 +# CHECK: [[BASE11:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE10]], [[VAL]], %subreg.sub11 +# CHECK: [[BASE12:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE11]], [[VAL]], %subreg.sub12 +# CHECK: [[BASE13:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE12]], [[VAL]], %subreg.sub13 
+# CHECK: [[BASE14:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE13]], [[VAL]], %subreg.sub14 +# CHECK: [[BASE15:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE14]], [[VAL]], %subreg.sub15 + +body: | + bb.0: + %0:sgpr(s512) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(s512) = G_INSERT %0:sgpr, %1:sgpr(s32), 0 + %3:sgpr(s512) = G_INSERT %2:sgpr, %1:sgpr(s32), 32 + %4:sgpr(s512) = G_INSERT %3:sgpr, %1:sgpr(s32), 64 + %5:sgpr(s512) = G_INSERT %4:sgpr, %1:sgpr(s32), 96 + %6:sgpr(s512) = G_INSERT %5:sgpr, %1:sgpr(s32), 128 + %7:sgpr(s512) = G_INSERT %6:sgpr, %1:sgpr(s32), 160 + %8:sgpr(s512) = G_INSERT %7:sgpr, %1:sgpr(s32), 192 + %9:sgpr(s512) = G_INSERT %8:sgpr, %1:sgpr(s32), 224 + %10:sgpr(s512) = G_INSERT %9:sgpr, %1:sgpr(s32), 256 + %11:sgpr(s512) = G_INSERT %10:sgpr, %1:sgpr(s32), 288 + %12:sgpr(s512) = G_INSERT %11:sgpr, %1:sgpr(s32), 320 + %13:sgpr(s512) = G_INSERT %12:sgpr, %1:sgpr(s32), 352 + %14:sgpr(s512) = G_INSERT %13:sgpr, %1:sgpr(s32), 384 + %15:sgpr(s512) = G_INSERT %14:sgpr, %1:sgpr(s32), 416 + %16:sgpr(s512) = G_INSERT %15:sgpr, %1:sgpr(s32), 448 + %17:sgpr(s512) = G_INSERT %16:sgpr, %1:sgpr(s32), 480 + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %17:sgpr(s512) + SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 -- GitLab From 8e011ae1984fe276b409af3d8ff612f17f786c8e Mon Sep 17 00:00:00 2001 From: Kostya Serebryany Date: Thu, 11 Oct 2018 23:03:27 +0000 Subject: [PATCH 0083/1116] merge two near-identical functions createPrivateGlobalForString into one Summary: We have two copies of createPrivateGlobalForString (in asan and in esan). This change merges them into one. 
NFC Reviewers: vitalybuka Reviewed By: vitalybuka Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53178 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344314 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Instrumentation.h | 5 +++++ .../Instrumentation/AddressSanitizer.cpp | 22 ++++--------------- .../Instrumentation/EfficiencySanitizer.cpp | 15 ------------- .../Instrumentation/Instrumentation.cpp | 17 ++++++++++++++ 4 files changed, 26 insertions(+), 33 deletions(-) diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index d6d9529ba9a..2157fcab726 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -36,6 +36,11 @@ class OptimizationRemarkEmitter; BasicBlock::iterator PrepareToSplitEntryBlock(BasicBlock &BB, BasicBlock::iterator IP); +// Create a constant for Str so that we can pass it to the run-time lib. +GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str, + bool AllowMerging, + const char *NamePrefix = ""); + // Insert GCOV profiling instrumentation struct GCOVOptions { static GCOVOptions getDefault(); diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b819565e7ba..b832417154e 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1174,25 +1174,11 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { return Res; } -// Create a constant for Str so that we can pass it to the run-time lib. -static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str, - bool AllowMerging) { - Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); - // We use private linkage for module-local strings. If they can be merged - // with another one, we set the unnamed_addr attribute. 
- GlobalVariable *GV = - new GlobalVariable(M, StrConst->getType(), true, - GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix); - if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(1); // Strings may not be merged w/o setting align 1. - return GV; -} - /// Create a global describing a source location. static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M, LocationMetadata MD) { Constant *LocData[] = { - createPrivateGlobalForString(M, MD.Filename, true), + createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix), ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo), ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo), }; @@ -2179,7 +2165,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool // We shouldn't merge same module names, as this string serves as unique // module ID in runtime. GlobalVariable *ModuleName = createPrivateGlobalForString( - M, M.getModuleIdentifier(), /*AllowMerging*/ false); + M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix); for (size_t i = 0; i < n; i++) { static const uint64_t kMaxGlobalRedzone = 1 << 18; @@ -2191,7 +2177,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool // if it's available, otherwise just write the name of global variable). GlobalVariable *Name = createPrivateGlobalForString( M, MD.Name.empty() ? 
NameForGlobal : MD.Name, - /*AllowMerging*/ true); + /*AllowMerging*/ true, kAsanGenPrefix); Type *Ty = G->getValueType(); uint64_t SizeInBytes = DL.getTypeAllocSize(Ty); @@ -3072,7 +3058,7 @@ void FunctionStackPoisoner::processStaticAllocas() { IntptrPtrTy); GlobalVariable *StackDescriptionGlobal = createPrivateGlobalForString(*F.getParent(), DescriptionString, - /*AllowMerging*/ true); + /*AllowMerging*/ true, kAsanGenPrefix); Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); IRB.CreateStore(Description, BasePlus1); // Write the PC to redzone[2]. diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp index 33f220a893d..0ab915de60d 100644 --- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp +++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp @@ -144,21 +144,6 @@ OverrideOptionsFromCL(EfficiencySanitizerOptions Options) { return Options; } -// Create a constant for Str so that we can pass it to the run-time lib. -static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str, - bool AllowMerging) { - Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); - // We use private linkage for module-local strings. If they can be merged - // with another one, we set the unnamed_addr attribute. - GlobalVariable *GV = - new GlobalVariable(M, StrConst->getType(), true, - GlobalValue::PrivateLinkage, StrConst, ""); - if (AllowMerging) - GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(1); // Strings may not be merged w/o setting align 1. - return GV; -} - /// EfficiencySanitizer: instrument each module to find performance issues. 
class EfficiencySanitizer : public ModulePass { public: diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index ea819c1856b..1c739c09e39 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm-c/Initialization.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" #include "llvm/PassRegistry.h" @@ -53,6 +54,22 @@ BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB, return IP; } +// Create a constant for Str so that we can pass it to the run-time lib. +GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str, + bool AllowMerging, + const char *NamePrefix) { + Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); + // We use private linkage for module-local strings. If they can be merged + // with another one, we set the unnamed_addr attribute. + GlobalVariable *GV = + new GlobalVariable(M, StrConst->getType(), true, + GlobalValue::PrivateLinkage, StrConst, NamePrefix); + if (AllowMerging) + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + GV->setAlignment(1); // Strings may not be merged w/o setting align 1. + return GV; +} + /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. void llvm::initializeInstrumentation(PassRegistry &Registry) { -- GitLab From f7c87d986fb77519ba7cd394bf30da7d0e28241b Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Thu, 11 Oct 2018 23:14:35 +0000 Subject: [PATCH 0084/1116] X86/TargetTransformInfo: Report div/rem constant immediate costs as TCC_Free DIV/REM by constants should always be expanded into mul/shift/etc. patterns. Unfortunately the ConstantHoisting pass runs too early at a point where the pattern isn't expanded yet. 
However after ConstantHoisting hoisted some immediate the result may not expand anymore. Also the hoisting typically doesn't make sense because it operates on immediates that will change completely during the expansion. Report DIV/REM as TCC_Free so ConstantHoisting will not touch them. Differential Revision: https://reviews.llvm.org/D53174 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344315 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86TargetTransformInfo.cpp | 6 ++- .../ConstantHoisting/X86/bad-cases.ll | 47 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 test/Transforms/ConstantHoisting/X86/bad-cases.ll diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 4c14715b758..d3a75123935 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2342,11 +2342,15 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, return TTI::TCC_Free; ImmIdx = 1; break; - case Instruction::Mul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: + // Division by constant is typically expanded later into a different + // instruction sequence. This completely changes the constants. + // Report them as "free" to stop ConstantHoist from marking them as opaque. + return TTI::TCC_Free; + case Instruction::Mul: case Instruction::Or: case Instruction::Xor: ImmIdx = 1; diff --git a/test/Transforms/ConstantHoisting/X86/bad-cases.ll b/test/Transforms/ConstantHoisting/X86/bad-cases.ll new file mode 100644 index 00000000000..00890942096 --- /dev/null +++ b/test/Transforms/ConstantHoisting/X86/bad-cases.ll @@ -0,0 +1,47 @@ +; RUN: opt -consthoist -S < %s | FileCheck %s +target triple = "x86_64--" + +; We don't want to convert constant divides because the benefit from converting +; them to a mul in the backend is larget than constant materialization savings. 
+define void @signed_const_division(i64 %in1, i64 %in2, i64* %addr) { +; CHECK-LABEL: @signed_const_division +; CHECK: %res1 = sdiv i64 %l1, 4294967296 +; CHECK: %res2 = srem i64 %l2, 4294967296 +entry: + br label %loop + +loop: + %l1 = phi i64 [%res1, %loop], [%in1, %entry] + %l2 = phi i64 [%res2, %loop], [%in2, %entry] + %res1 = sdiv i64 %l1, 4294967296 + store volatile i64 %res1, i64* %addr + %res2 = srem i64 %l2, 4294967296 + store volatile i64 %res2, i64* %addr + %again = icmp eq i64 %res1, %res2 + br i1 %again, label %loop, label %end + +end: + ret void +} + +define void @unsigned_const_division(i64 %in1, i64 %in2, i64* %addr) { +; CHECK-LABEL: @unsigned_const_division +; CHECK: %res1 = udiv i64 %l1, 4294967296 +; CHECK: %res2 = urem i64 %l2, 4294967296 + +entry: + br label %loop + +loop: + %l1 = phi i64 [%res1, %loop], [%in1, %entry] + %l2 = phi i64 [%res2, %loop], [%in2, %entry] + %res1 = udiv i64 %l1, 4294967296 + store volatile i64 %res1, i64* %addr + %res2 = urem i64 %l2, 4294967296 + store volatile i64 %res2, i64* %addr + %again = icmp eq i64 %res1, %res2 + br i1 %again, label %loop, label %end + +end: + ret void +} -- GitLab From 639949cb1501998d06b97f782079ed5fe65e597a Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 11 Oct 2018 23:36:46 +0000 Subject: [PATCH 0085/1116] Revert "AMDGPU/GlobalISel: Implement select for G_INSERT" This reverts commit r344310. The test case was failing on some bots. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344317 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 30 ------------ lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 1 - .../AMDGPU/GlobalISel/inst-select-insert.mir | 49 ------------------- 3 files changed, 80 deletions(-) delete mode 100644 test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 55ceb8f666f..8eb49d49b2e 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -178,34 +178,6 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const { - MachineBasicBlock *BB = I.getParent(); - MachineFunction *MF = BB->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32); - DebugLoc DL = I.getDebugLoc(); - MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG)) - .addDef(I.getOperand(0).getReg()) - .addReg(I.getOperand(1).getReg()) - .addReg(I.getOperand(2).getReg()) - .addImm(SubReg); - - for (const MachineOperand &MO : Ins->operands()) { - if (!MO.isReg()) - continue; - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) - continue; - - const TargetRegisterClass *RC = - TRI.getConstrainedRegClassForOperand(MO, MRI); - if (!RC) - continue; - RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); - } - I.eraseFromParent(); - return true; -} - bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { unsigned IntrinsicID = I.getOperand(1).getIntrinsicID(); @@ -668,8 +640,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, return selectG_GEP(I); case TargetOpcode::G_IMPLICIT_DEF: return selectG_IMPLICIT_DEF(I); - case 
TargetOpcode::G_INSERT: - return selectG_INSERT(I); case TargetOpcode::G_INTRINSIC: return selectG_INTRINSIC(I, CoverageInfo); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index f3a835a32a8..449431adc56 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -67,7 +67,6 @@ private: bool selectG_ADD(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; bool selectG_IMPLICIT_DEF(MachineInstr &I) const; - bool selectG_INSERT(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir deleted file mode 100644 index 93e35ead4d4..00000000000 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert.mir +++ /dev/null @@ -1,49 +0,0 @@ -# RUN: llc -march=amdgcn -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s ---- - -name: insert512 -legalized: true -regBankSelected: true - -# CHECK-LABEL: insert512 -# CHECK: [[BASE:%[0-9]+]]:sreg_512 = IMPLICIT_DEF -# CHECK: [[VAL:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF -# CHECK: [[BASE0:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE]], [[VAL]], %subreg.sub0 -# CHECK: [[BASE1:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE0]], [[VAL]], %subreg.sub1 -# CHECK: [[BASE2:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE1]], [[VAL]], %subreg.sub2 -# CHECK: [[BASE3:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE2]], [[VAL]], %subreg.sub3 -# CHECK: [[BASE4:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE3]], [[VAL]], %subreg.sub4 -# CHECK: [[BASE5:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE4]], [[VAL]], %subreg.sub5 -# CHECK: [[BASE6:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE5]], [[VAL]], %subreg.sub6 -# CHECK: 
[[BASE7:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE6]], [[VAL]], %subreg.sub7 -# CHECK: [[BASE8:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE7]], [[VAL]], %subreg.sub8 -# CHECK: [[BASE9:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE8]], [[VAL]], %subreg.sub9 -# CHECK: [[BASE10:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE9]], [[VAL]], %subreg.sub10 -# CHECK: [[BASE11:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE10]], [[VAL]], %subreg.sub11 -# CHECK: [[BASE12:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE11]], [[VAL]], %subreg.sub12 -# CHECK: [[BASE13:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE12]], [[VAL]], %subreg.sub13 -# CHECK: [[BASE14:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE13]], [[VAL]], %subreg.sub14 -# CHECK: [[BASE15:%[0-9]+]]:sreg_512 = INSERT_SUBREG [[BASE14]], [[VAL]], %subreg.sub15 - -body: | - bb.0: - %0:sgpr(s512) = G_IMPLICIT_DEF - %1:sgpr(s32) = G_IMPLICIT_DEF - %2:sgpr(s512) = G_INSERT %0:sgpr, %1:sgpr(s32), 0 - %3:sgpr(s512) = G_INSERT %2:sgpr, %1:sgpr(s32), 32 - %4:sgpr(s512) = G_INSERT %3:sgpr, %1:sgpr(s32), 64 - %5:sgpr(s512) = G_INSERT %4:sgpr, %1:sgpr(s32), 96 - %6:sgpr(s512) = G_INSERT %5:sgpr, %1:sgpr(s32), 128 - %7:sgpr(s512) = G_INSERT %6:sgpr, %1:sgpr(s32), 160 - %8:sgpr(s512) = G_INSERT %7:sgpr, %1:sgpr(s32), 192 - %9:sgpr(s512) = G_INSERT %8:sgpr, %1:sgpr(s32), 224 - %10:sgpr(s512) = G_INSERT %9:sgpr, %1:sgpr(s32), 256 - %11:sgpr(s512) = G_INSERT %10:sgpr, %1:sgpr(s32), 288 - %12:sgpr(s512) = G_INSERT %11:sgpr, %1:sgpr(s32), 320 - %13:sgpr(s512) = G_INSERT %12:sgpr, %1:sgpr(s32), 352 - %14:sgpr(s512) = G_INSERT %13:sgpr, %1:sgpr(s32), 384 - %15:sgpr(s512) = G_INSERT %14:sgpr, %1:sgpr(s32), 416 - %16:sgpr(s512) = G_INSERT %15:sgpr, %1:sgpr(s32), 448 - %17:sgpr(s512) = G_INSERT %16:sgpr, %1:sgpr(s32), 480 - $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %17:sgpr(s512) - SI_RETURN_TO_EPILOG $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 -- 
GitLab From 0af72938856257f7c54320be1bf19873a3cc90b8 Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Thu, 11 Oct 2018 23:37:58 +0000 Subject: [PATCH 0086/1116] Revert "DwarfDebug: Pick next location in case of missing location at block begin" It originally triggered a stepping problem in the debugger, which could be fixed by adjusting CodeGen/LexicalScopes.cpp however it seems we prefer the previous behavior anyway. See the discussion for details: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20181008/593833.html This reverts commit r343880. This reverts commit r343874. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344318 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 112 +++++++----------- lib/CodeGen/AsmPrinter/DwarfDebug.h | 3 - test/DebugInfo/AArch64/line-header.ll | 2 +- .../single-constant-use-preserves-dbgloc.ll | 3 +- test/DebugInfo/Mips/delay-slot.ll | 2 +- test/DebugInfo/NVPTX/debug-info.ll | 2 +- test/DebugInfo/X86/dwarf-no-source-loc.ll | 11 +- test/DebugInfo/X86/dwarf-no-source-loc.mir | 74 ------------ 8 files changed, 52 insertions(+), 157 deletions(-) delete mode 100644 test/DebugInfo/X86/dwarf-no-source-loc.mir diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ab3559d63cc..94e12658cfe 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1371,49 +1371,6 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, } } -static const DebugLoc & -findNextDebugLoc(MachineBasicBlock::const_iterator MBBI, - MachineBasicBlock::const_iterator MBBE) { - static DebugLoc NoLocation; - for ( ; MBBI != MBBE; ++MBBI) { - if (MBBI->isDebugInstr()) - continue; - const DebugLoc &DL = MBBI->getDebugLoc(); - if (DL) - return DL; - } - return NoLocation; -} - -void DwarfDebug::emitDebugLoc(const DebugLoc &DL) { - unsigned LastAsmLine = - Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine(); - - // We have an 
explicit location, different from the previous location. - // Don't repeat a line-0 record, but otherwise emit the new location. - // (The new location might be an explicit line 0, which we do emit.) - unsigned Line = DL.getLine(); - if (PrevInstLoc && Line == 0 && LastAsmLine == 0) - return; - unsigned Flags = 0; - if (DL == PrologEndLoc) { - Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT; - PrologEndLoc = DebugLoc(); - } - // If the line changed, we call that a new statement; unless we went to - // line 0 and came back, in which case it is not a new statement. - unsigned OldLine = PrevInstLoc ? PrevInstLoc.getLine() : LastAsmLine; - if (Line && Line != OldLine) - Flags |= DWARF2_FLAG_IS_STMT; - - const MDNode *Scope = DL.getScope(); - recordSourceLine(Line, DL.getCol(), Scope, Flags); - - // If we're not at line 0, remember this location. - if (Line) - PrevInstLoc = DL; -} - // Process beginning of an instruction. void DwarfDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); @@ -1458,41 +1415,54 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { // If we have already emitted a line-0 record, don't repeat it. if (LastAsmLine == 0) return; - // By default we emit nothing to avoid line table bloat. However at the - // beginning of a basic block or after a label it is undesirable to let - // the previous location unchanged. In these cases do a forward search for - // the next valid debug location. - if (UnknownLocations == Default) { - const MachineBasicBlock &MBB = *MI->getParent(); - if (!PrevLabel && PrevInstBB == &MBB) - return; - - const DebugLoc &NextDL = findNextDebugLoc(MI->getIterator(), MBB.end()); - if (NextDL) { - emitDebugLoc(NextDL); - return; - } - } - - // We should emit a line-0 record. // If user said Don't Do That, don't do that. if (UnknownLocations == Disable) return; - // Emit a line-0 record now. 
- // Preserve the file and column numbers, if we can, to save space in - // the encoded line table. - // Do not update PrevInstLoc, it remembers the last non-0 line. - const MDNode *Scope = nullptr; - unsigned Column = 0; - if (PrevInstLoc) { - Scope = PrevInstLoc.getScope(); - Column = PrevInstLoc.getCol(); + // See if we have a reason to emit a line-0 record now. + // Reasons to emit a line-0 record include: + // - User asked for it (UnknownLocations). + // - Instruction has a label, so it's referenced from somewhere else, + // possibly debug information; we want it to have a source location. + // - Instruction is at the top of a block; we don't want to inherit the + // location from the physically previous (maybe unrelated) block. + if (UnknownLocations == Enable || PrevLabel || + (PrevInstBB && PrevInstBB != MI->getParent())) { + // Preserve the file and column numbers, if we can, to save space in + // the encoded line table. + // Do not update PrevInstLoc, it remembers the last non-0 line. + const MDNode *Scope = nullptr; + unsigned Column = 0; + if (PrevInstLoc) { + Scope = PrevInstLoc.getScope(); + Column = PrevInstLoc.getCol(); + } + recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0); } - recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0); return; } - emitDebugLoc(DL); + // We have an explicit location, different from the previous location. + // Don't repeat a line-0 record, but otherwise emit the new location. + // (The new location might be an explicit line 0, which we do emit.) + if (PrevInstLoc && DL.getLine() == 0 && LastAsmLine == 0) + return; + unsigned Flags = 0; + if (DL == PrologEndLoc) { + Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT; + PrologEndLoc = DebugLoc(); + } + // If the line changed, we call that a new statement; unless we went to + // line 0 and came back, in which case it is not a new statement. + unsigned OldLine = PrevInstLoc ? 
PrevInstLoc.getLine() : LastAsmLine; + if (DL.getLine() && DL.getLine() != OldLine) + Flags |= DWARF2_FLAG_IS_STMT; + + const MDNode *Scope = DL.getScope(); + recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags); + + // If we're not at line 0, remember this location. + if (DL.getLine()) + PrevInstLoc = DL; } static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index e115eb771fb..fecf8056765 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -723,9 +723,6 @@ public: bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; } bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; } /// @} - -private: - void emitDebugLoc(const DebugLoc &DL); }; } // end namespace llvm diff --git a/test/DebugInfo/AArch64/line-header.ll b/test/DebugInfo/AArch64/line-header.ll index 2ac94728b86..1d9156debf1 100644 --- a/test/DebugInfo/AArch64/line-header.ll +++ b/test/DebugInfo/AArch64/line-header.ll @@ -3,4 +3,4 @@ ; check line table length is correctly calculated for both big and little endian CHECK-LABEL: .debug_line contents: -CHECK: total_length: 0x0000003c +CHECK: total_length: 0x0000003f diff --git a/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll b/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll index fa1dbb531d3..af76c889353 100644 --- a/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll +++ b/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll @@ -31,10 +31,11 @@ if.then: ; preds = %entry if.end: ; preds = %entry ; Materialize the constant. -; CHECK: .loc 1 7 5 +; CHECK: .loc 1 0 ; CHECK-NEXT: mvn r0, #0 ; The backend performs the store to %retval first, for some reason. 
+; CHECK-NEXT: .loc 1 7 5 ; CHECK-NEXT: str r0, [sp, #4] store i32 -1, i32* %x, align 4, !dbg !19 diff --git a/test/DebugInfo/Mips/delay-slot.ll b/test/DebugInfo/Mips/delay-slot.ll index f8959a2c52b..8f444bce30f 100644 --- a/test/DebugInfo/Mips/delay-slot.ll +++ b/test/DebugInfo/Mips/delay-slot.ll @@ -16,7 +16,7 @@ ; CHECK: 0x0000000000000004 2 0 1 0 0 is_stmt prologue_end ; CHECK: 0x0000000000000024 3 0 1 0 0 is_stmt ; CHECK: 0x0000000000000034 4 0 1 0 0 is_stmt -; CHECK: 0x0000000000000044 5 0 1 0 0 is_stmt +; CHECK: 0x0000000000000048 5 0 1 0 0 is_stmt ; CHECK: 0x0000000000000058 5 0 1 0 0 is_stmt end_sequence diff --git a/test/DebugInfo/NVPTX/debug-info.ll b/test/DebugInfo/NVPTX/debug-info.ll index f80a8426286..d5dee4055f0 100644 --- a/test/DebugInfo/NVPTX/debug-info.ll +++ b/test/DebugInfo/NVPTX/debug-info.ll @@ -36,7 +36,6 @@ ; CHECK: setp.ge.s32 %p{{.+}}, %r{{.+}}, %r{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7 ; CHECK: @%p{{.+}} bra [[BB:.+]]; -; CHECK: .loc [[DEBUG_INFO_CU]] 8 13 ; CHECK: ld.param.f32 %f{{.+}}, [{{.+}}]; ; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; ; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; @@ -44,6 +43,7 @@ ; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: mul.wide.u32 %rd{{.+}}, %r{{.+}}, 4; ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; +; CHECK: .loc [[DEBUG_INFO_CU]] 8 13 ; CHECK: ld.global.f32 %f{{.+}}, [%rd{{.+}}]; ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 8 19 diff --git a/test/DebugInfo/X86/dwarf-no-source-loc.ll b/test/DebugInfo/X86/dwarf-no-source-loc.ll index 19695ab126b..60d50a391a1 100644 --- a/test/DebugInfo/X86/dwarf-no-source-loc.ll +++ b/test/DebugInfo/X86/dwarf-no-source-loc.ll @@ -40,14 +40,15 @@ if.end: ; preds = %if.then, %entry ret void, !dbg !14 } -; CHECK: .loc 1 7 7 prologue_end +; CHECK: .loc 1 7 7 ; CHECK-NOT: .loc -; CHECK: # %bb.1 -; CHECK-NEXT: .file 2 "/tests{{[/\]+}}include.h" -; CHECK-NEXT: .loc 2 20 5 +; CHECK: .loc 1 0 7 is_stmt 0 ; CHECK-NOT: .loc 
+; CHECK: .loc 2 20 5 is_stmt 1 ; CHECK: .LBB0_2: -; CHECK: .loc 1 10 3 +; CHECK-NEXT: .loc 2 0 5 is_stmt 0 +; CHECK-NOT: .loc +; CHECK: .loc 1 10 3 is_stmt 1 ; ; DISABLE-NOT: .loc 1 0 diff --git a/test/DebugInfo/X86/dwarf-no-source-loc.mir b/test/DebugInfo/X86/dwarf-no-source-loc.mir deleted file mode 100644 index f6ad6ee6d4c..00000000000 --- a/test/DebugInfo/X86/dwarf-no-source-loc.mir +++ /dev/null @@ -1,74 +0,0 @@ -# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Default | FileCheck %s --check-prefixes=CHECK,DEFAULT -# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Enable | FileCheck %s --check-prefixes=CHECK,ENABLE -# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Disable | FileCheck %s --check-prefixes=CHECK,DISABLE ---- | - target triple = "x86_64--" - - !0 = !DIFile(filename: "dwarf-no-source-loc.mir", directory: "/") - !1 = distinct !DICompileUnit(file: !0, language: DW_LANG_C, emissionKind: LineTablesOnly) - !2 = distinct !DISubprogram(name: "func", unit: !1) - !3 = !DILocation(line: 17, scope: !2) - !4 = !DILocation(line: 42, scope: !2) - - !llvm.dbg.cu = !{!1} - !llvm.module.flags = !{!10, !11} - !10 = !{i32 2, !"Dwarf Version", i32 4} - !11 = !{i32 2, !"Debug Info Version", i32 3} - - define void @func() !dbg !2 { - unreachable - } -... 
---- -name: func -body: | - bb.0: - NOOP - NOOP - $eax = MOV32ri 1, debug-location !3 - ; CHECK-LABEL: bb.0 - ; CHECK: nop - ; CHECK: nop - ; CHECK: .loc 1 17 0 prologue_end - ; CHECK: movl $1, %eax - - bb.1: - NOOP - $ebx = MOV32ri 2, debug-location !4 - ; CHECK-LABEL: bb.1 - ; DEFAULT: .loc 1 42 0 - ; ENABLE: .loc 1 0 - ; DISABLE-NOT: .loc 1 0 - ; CHECK: nop - ; ENABLE: .loc 1 42 0 - ; CHECK: movl $2, %ebx - - bb.2: - NOOP - ; CHECK-LABEL: bb.2 - ; DEFAULT: .loc 1 0 0 is_stmt 0 - ; ENABLE: .loc 1 0 0 is_stmt 0 - ; DISABLE-NOT: .loc 1 0 - ; CHECK: nop - - bb.3: - NOOP - $ecx = MOV32ri 3, debug-location !3 - ; CHECK-LABEL: bb.3 - ; CHECK: nop - ; DEFAULT: .loc 1 17 0 is_stmt 1 - ; ENABLE: .loc 1 17 0 is_stmt 1 - ; DISABLE-NOT: .loc 1 0 - ; CHECK: movl $3, %ecx - - bb.4: - NOOP - $edx = MOV32ri 4, debug-location !4 - ; CHECK: bb.4 - ; DEFAULT: .loc 1 42 0 - ; ENABLE: .loc 1 0 0 is_stmt 0 - ; DISABLE-NOT: .loc 1 0 - ; CHECK: nop - ; ENABLE: .loc 1 42 0 is_stmt 1 - ; CHECK: movl $4, %edx -... -- GitLab From bd755d4e272af07dbd5a65ab2d1acf4a3a42e510 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 11 Oct 2018 23:56:56 +0000 Subject: [PATCH 0087/1116] [DAGCombiner] rearrange extract_element+bitcast fold; NFC I want to add another pattern here that includes scalar_to_vector, so this makes that patch smaller. I was hoping to remove the hasOneUse() check because it shouldn't be necessary for common codegen, but an AMDGPU test has a comment suggesting that the extra check makes things better on one of those targets. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344320 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 ++++++++------ test/CodeGen/X86/extract-insert.ll | 4 ++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 16834dc1a26..7ec5fac390b 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15499,13 +15499,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // converts. } - // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x) - bool isLE = DAG.getDataLayout().isLittleEndian(); - unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1; - if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() && - ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) { + if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST) { + // The vector index of the LSBs of the source depend on the endian-ness. + bool IsLE = DAG.getDataLayout().isLittleEndian(); + + // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x) + unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1; SDValue BCSrc = InVec.getOperand(0); - if (BCSrc.getValueType().isScalarInteger()) + if (InVec.hasOneUse() && ConstEltNo->getZExtValue() == BCTruncElt && + VT.isInteger() && BCSrc.getValueType().isScalarInteger()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc); } diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll index de8ee704b88..b3fb50de718 100644 --- a/test/CodeGen/X86/extract-insert.ll +++ b/test/CodeGen/X86/extract-insert.ll @@ -28,6 +28,10 @@ define i8 @extractelt_bitcast(i32 %x) nounwind { ret i8 %ext } +; TODO: This should have folded to avoid vector ops, but the transform +; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU +; codegen better. 
+ define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind { ; X86-LABEL: extractelt_bitcast_extra_use: ; X86: # %bb.0: -- GitLab From 3b7de9d1bb445d9f7652afe058e33a1ff3053036 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Fri, 12 Oct 2018 00:36:01 +0000 Subject: [PATCH 0088/1116] [llvm-objcopy] Add -F|--target compatibility Summary: This change adds support for the GNU --target flag, which sets both --input-target and --output-target. GNU objcopy doesn't do any checking for whether both --target and --{input,output}-target are used, and so it allows both, e.g. "--target A --output-target B" is equivalent to "--input-target A --output-target B" since the later command line flag would override earlier ones. This may be error prone, so I chose to implement it as an error if both are used. I'm not sure if anyone is actually using both. Reviewers: jakehehrlich, jhenderson, alexshap Reviewed By: jakehehrlich, alexshap Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53029 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344321 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm-objcopy/input-output-target.test | 22 +++++++++++++++++++ tools/llvm-objcopy/CopyConfig.cpp | 14 ++++++++++-- tools/llvm-objcopy/ObjcopyOpts.td | 4 ++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 test/tools/llvm-objcopy/input-output-target.test diff --git a/test/tools/llvm-objcopy/input-output-target.test b/test/tools/llvm-objcopy/input-output-target.test new file mode 100644 index 00000000000..e81770a239a --- /dev/null +++ b/test/tools/llvm-objcopy/input-output-target.test @@ -0,0 +1,22 @@ +# RUN: echo abcd > %t.txt + +# Preserve input to verify it is not modified +# RUN: cp %t.txt %t-copy.txt + +# -F is equivalent to -I -O +# RUN: llvm-objcopy -F binary -B i386:x86-64 %t.txt %t.2.txt +# RUN: cmp %t-copy.txt %t.2.txt + +# --target is equivalent to --input-target --output-target +# RUN: llvm-objcopy --target binary -B 
i386:x86-64 %t.txt %t.3.txt +# RUN: cmp %t-copy.txt %t.3.txt + +# TODO: check --target and --input-target/--output-target are incompatible +# RUN: not llvm-objcopy --target binary --input-target binary -B i386:x86-64 \ +# RUN: %t.txt %t.4.txt 2>&1 \ +# RUN: | FileCheck %s --check-prefix=BAD-FLAG +# RUN: not llvm-objcopy --target binary --output-target binary -B i386:x86-64 \ +# RUN: %t.txt %t.4.txt 2>&1 \ +# RUN: | FileCheck %s --check-prefix=BAD-FLAG + +# BAD-FLAG: --target cannot be used with --input-target or --output-target. diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp index d814df10525..2c3551ba026 100644 --- a/tools/llvm-objcopy/CopyConfig.cpp +++ b/tools/llvm-objcopy/CopyConfig.cpp @@ -247,8 +247,18 @@ DriverConfig parseObjcopyOptions(ArrayRef ArgsArr) { CopyConfig Config; Config.InputFilename = Positional[0]; Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1]; - Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target); - Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target); + if (InputArgs.hasArg(OBJCOPY_target) && + (InputArgs.hasArg(OBJCOPY_input_target) || + InputArgs.hasArg(OBJCOPY_output_target))) + error("--target cannot be used with --input-target or --output-target"); + + if (InputArgs.hasArg(OBJCOPY_target)) { + Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_target); + Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_target); + } else { + Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target); + Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target); + } if (Config.InputFormat == "binary") { auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture); if (BinaryArch.empty()) diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td index 18b270b7758..f6c8a959e8b 100644 --- a/tools/llvm-objcopy/ObjcopyOpts.td +++ b/tools/llvm-objcopy/ObjcopyOpts.td @@ -10,6 +10,10 @@ defm 
binary_architecture : Eq<"binary-architecture">, HelpText<"Used when transforming an architecture-less format (such as binary) to another format">; def B : JoinedOrSeparate<["-"], "B">, Alias; +defm target : Eq<"target">, + HelpText<"Format of the input and output file">, + Values<"binary">; +def F : JoinedOrSeparate<[ "-" ], "F">, Alias; defm input_target : Eq<"input-target">, HelpText<"Format of the input file">, Values<"binary">; -- GitLab From 441f8c5b1f48b72ed2b92ccb370b7e6fb73def30 Mon Sep 17 00:00:00 2001 From: Eugene Leviant Date: Fri, 12 Oct 2018 07:24:02 +0000 Subject: [PATCH 0089/1116] [ThinLTO] Don't import GV which contains blockaddress Differential revision: https://reviews.llvm.org/D53139 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344325 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/ModuleSummaryAnalysis.cpp | 19 ++++++++++++++++--- lib/Transforms/IPO/FunctionImport.cpp | 3 +-- .../X86/Inputs/globals-import-blockaddr.ll | 12 ++++++++++++ test/ThinLTO/X86/globals-import-blockaddr.ll | 18 ++++++++++++++++++ 4 files changed, 47 insertions(+), 5 deletions(-) create mode 100644 test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll create mode 100644 test/ThinLTO/X86/globals-import-blockaddr.ll diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index bca40043fd9..3eb150becfa 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -74,9 +74,17 @@ cl::opt FSEC( // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. Invoked either on an // Instruction or a GlobalVariable (which walks its initializer). -static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser, +// Return true if any of the operands contains blockaddress. 
This is important +// to know when computing summary for global var, because if global variable +// references basic block address we can't import it separately from function +// containing that basic block. For simplicity we currently don't import such +// global vars at all. When importing function we aren't interested if any +// instruction in it takes an address of any basic block, because instruction +// can only take an address of basic block located in the same function. +static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser, SetVector &RefEdges, SmallPtrSet &Visited) { + bool HasBlockAddress = false; SmallVector Worklist; Worklist.push_back(CurUser); @@ -92,8 +100,10 @@ static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser, const User *Operand = dyn_cast(OI); if (!Operand) continue; - if (isa(Operand)) + if (isa(Operand)) { + HasBlockAddress = true; continue; + } if (auto *GV = dyn_cast(Operand)) { // We have a reference to a global value. This should be added to // the reference set unless it is a callee. 
Callees are handled @@ -105,6 +115,7 @@ static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser, Worklist.push_back(Operand); } } + return HasBlockAddress; } static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount, @@ -369,7 +380,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, DenseSet &CantBePromoted) { SetVector RefEdges; SmallPtrSet Visited; - findRefEdges(Index, &V, RefEdges, Visited); + bool HasBlockAddress = findRefEdges(Index, &V, RefEdges, Visited); bool NonRenamableLocal = isNonRenamableLocal(V); GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal, /* Live = */ false, V.isDSOLocal()); @@ -377,6 +388,8 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V, llvm::make_unique(Flags, RefEdges.takeVector()); if (NonRenamableLocal) CantBePromoted.insert(V.getGUID()); + if (HasBlockAddress) + GVarSummary->setNotEligibleToImport(); Index.addGlobalValueSummary(V, std::move(GVarSummary)); } diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 8f8c85e1b18..366ac2b95f4 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -278,8 +278,7 @@ static void computeImportForReferencedGlobals( for (auto &RefSummary : VI.getSummaryList()) if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind && - // Don't try to import regular LTO summaries added to dummy module. 
- !RefSummary->modulePath().empty() && + !RefSummary->notEligibleToImport() && !GlobalValue::isInterposableLinkage(RefSummary->linkage()) && RefSummary->refs().empty()) { ImportList[RefSummary->modulePath()].insert(VI.getGUID()); diff --git a/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll b/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll new file mode 100644 index 00000000000..fe1fa70ee83 --- /dev/null +++ b/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll @@ -0,0 +1,12 @@ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@label_addr = internal constant [1 x i8*] [i8* blockaddress(@foo, %lb)], align 8 + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define dso_local [1 x i8*]* @foo() { + br label %lb + +lb: + ret [1 x i8*]* @label_addr +} diff --git a/test/ThinLTO/X86/globals-import-blockaddr.ll b/test/ThinLTO/X86/globals-import-blockaddr.ll new file mode 100644 index 00000000000..d4ed674030a --- /dev/null +++ b/test/ThinLTO/X86/globals-import-blockaddr.ll @@ -0,0 +1,18 @@ +; RUN: opt -module-summary %s -o %t1.bc +; RUN: opt -module-summary %p/Inputs/globals-import-blockaddr.ll -o %t2.bc +; RUN: llvm-lto2 run -save-temps %t1.bc -r=%t1.bc,foo,l -r=%t1.bc,main,pl %t2.bc -r=%t2.bc,foo,pl -o %t3 +; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s + +; Verify that we haven't imported GV containing blockaddress +; CHECK: @label_addr.llvm.0 = external hidden constant + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare dso_local [1 x i8*]* @foo(); + +define dso_local i32 @main() { + %p = call [1 x i8*]* @foo() + %v = ptrtoint [1 x i8*]* %p to i32 + ret i32 %v +} -- GitLab From edac9f00e9e101be1520a77ebd51e4f274b020b3 Mon Sep 17 00:00:00 2001 From: Stefan Maksimovic Date: Fri, 12 Oct 2018 08:18:38 +0000 Subject: [PATCH 0090/1116] [mips] Mark fmaxl as a long double emulation routine Failure was discovered upon running 
projects/compiler-rt/test/builtins/Unit/divtc3_test.c in a stage2 compiler build. When compiling projects/compiler-rt/lib/builtins/divtc3.c, a call to fmaxl within the divtc3 implementation had its return values read from registers $2 and $3 instead of $f0 and $f2. Include fmaxl in the list of long double emulation routines to have its return value correctly interpreted as f128. Almost exact issue here: https://reviews.llvm.org/D17760 Differential Revision: https://reviews.llvm.org/D52649 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344326 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MipsCCState.cpp | 8 ++++---- test/CodeGen/Mips/cconv/fmaxl_call.ll | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/Mips/cconv/fmaxl_call.ll diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp index 81a1cced93b..90cb3f437bd 100644 --- a/lib/Target/Mips/MipsCCState.cpp +++ b/lib/Target/Mips/MipsCCState.cpp @@ -24,10 +24,10 @@ static bool isF128SoftLibCall(const char *CallSym) { "__lttf2", "__multf3", "__netf2", "__powitf2", "__subtf3", "__trunctfdf2", "__trunctfsf2", "__unordtf2", "ceill", "copysignl", "cosl", "exp2l", - "expl", "floorl", "fmal", "fmodl", - "log10l", "log2l", "logl", "nearbyintl", - "powl", "rintl", "roundl", "sinl", - "sqrtl", "truncl"}; + "expl", "floorl", "fmal", "fmaxl", + "fmodl", "log10l", "log2l", "logl", + "nearbyintl", "powl", "rintl", "roundl", + "sinl", "sqrtl", "truncl"}; // Check that LibCalls is sorted alphabetically. 
auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; diff --git a/test/CodeGen/Mips/cconv/fmaxl_call.ll b/test/CodeGen/Mips/cconv/fmaxl_call.ll new file mode 100644 index 00000000000..0e3078edae4 --- /dev/null +++ b/test/CodeGen/Mips/cconv/fmaxl_call.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64 < %s | FileCheck %s + +define fp128 @call_fmaxl(fp128 %a, fp128 %b) { +; CHECK-LABEL: call_fmaxl: +; CHECK: # %bb.0: +; CHECK-NEXT: daddiu $sp, $sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; CHECK-NEXT: .cfi_offset 31, -8 +; CHECK-NEXT: jal fmaxl +; CHECK-NEXT: nop +; CHECK-NEXT: mov.d $f12, $f0 +; CHECK-NEXT: jal f +; CHECK-NEXT: mov.d $f13, $f2 +; CHECK-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload +; CHECK-NEXT: jr $ra +; CHECK-NEXT: daddiu $sp, $sp, 16 + %1 = call fp128 @llvm.maxnum.f128(fp128 %a, fp128 %b) + %2 = call fp128 @f(fp128 %1) + ret fp128 %2 +} + +declare fp128 @llvm.maxnum.f128(fp128, fp128) +declare fp128 @f(fp128) -- GitLab From 66c3f51a52733500b8f08dfc2beabe845c57d467 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Fri, 12 Oct 2018 09:01:59 +0000 Subject: [PATCH 0091/1116] SCCP: avoid caching DenseMap entry that might be invalidated. Later calls to getValueState might insert entries into the ValueState map and cause reallocation, invalidating a reference. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344327 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Scalar/SCCP.cpp | 8 ++-- test/Transforms/SCCP/latticeval-invalidate.ll | 41 +++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 test/Transforms/SCCP/latticeval-invalidate.ll diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index d024e03b80a..7196bc82edc 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1017,8 +1017,9 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // Handle ICmpInst instruction. void SCCPSolver::visitCmpInst(CmpInst &I) { - LatticeVal &IV = ValueState[&I]; - if (IV.isOverdefined()) return; + // Do not cache this lookup, getValueState calls later in the function might + // invalidate the reference. + if (ValueState[&I].isOverdefined()) return; Value *Op1 = I.getOperand(0); Value *Op2 = I.getOperand(1); @@ -1046,7 +1047,8 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { } // If operands are still unknown, wait for it to resolve. 
- if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant()) + if (!V1State.isOverdefined() && !V2State.isOverdefined() && + !ValueState[&I].isConstant()) return; markOverdefined(&I); diff --git a/test/Transforms/SCCP/latticeval-invalidate.ll b/test/Transforms/SCCP/latticeval-invalidate.ll new file mode 100644 index 00000000000..19ea425312f --- /dev/null +++ b/test/Transforms/SCCP/latticeval-invalidate.ll @@ -0,0 +1,41 @@ +; RUN: opt -S -sccp %s + +@A = external constant i32 + +define void @test1() { +BB4: + %A20 = alloca i1 + %A15 = alloca i64 + %A7 = alloca i64 + %A3 = alloca i32** + %P = getelementptr i32, i32* @A, i32 0 + %B = ptrtoint i32* %P to i64 + %B8 = shl i64 %B, 9223372036854775807 + %G10 = getelementptr i32*, i32** undef, i64 %B + %B10 = urem i64 %B, %B8 + %B12 = shl i64 %B, %B + %BB = and i64 %B, %B8 + %B1 = xor i64 %B, %B + %B23 = lshr i64 %B8, undef + %C5 = icmp uge i64 %B, %B10 + %C17 = fcmp ord double 4.940660e-324, 0x7FEFFFFFFFFFFFFF + %C2 = icmp uge i1 %C17, false + %G = getelementptr i32, i32* %P, i1 %C17 + %X = select i1 false, i712 0, i712 1 + %C4 = icmp ule i1 true, false + %B3 = xor i1 %C17, %C2 + %C33 = icmp slt i1 false, %C5 + %B15 = sub i64 %B8, %B23 + %C18 = icmp slt i64 undef, %BB + %G29 = getelementptr i32**, i32*** undef, i64 %B15 + %C35 = icmp eq i1 %C17, undef + %C31 = icmp ult i1 %C35, %C5 + %C29 = icmp sle i1 true, %C5 + %C16 = icmp ne i16 -1, -32768 + %A24 = alloca i1 + %A21 = alloca i1 + %A25 = alloca i32** + %C7 = icmp ule i1 %C4, %B3 + %C14 = icmp slt i64 %B8, 0 + ret void +} -- GitLab From 39e3cf3d167b7b607e7490409984809a36649f38 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 10:20:16 +0000 Subject: [PATCH 0092/1116] [X86] Ignore float/double non-temporal loads (PR39256) Scalar non-temporal loads were asserting instead of just being ignored. 
Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=10895 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344331 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelDAGToDAG.cpp | 3 +++ test/CodeGen/X86/nontemporal-loads.ll | 32 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index f8ec4a2bcfc..ede1c0bd7df 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -441,6 +441,9 @@ namespace { switch (StoreSize) { default: llvm_unreachable("Unsupported store size"); + case 4: + case 8: + return false; case 16: return Subtarget->hasSSE41(); case 32: diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll index 37ff7115ac9..56428979568 100644 --- a/test/CodeGen/X86/nontemporal-loads.ll +++ b/test/CodeGen/X86/nontemporal-loads.ll @@ -1911,4 +1911,36 @@ define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %m ret <16 x i32>%res } +; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=10895 +define i32 @PR39256(float* %ptr) { +; SSE-LABEL: PR39256: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: ucomiss {{.*}}(%rip), %xmm0 +; SSE-NEXT: setb (%rax) +; SSE-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; SSE-NEXT: retq +; +; AVX-LABEL: PR39256: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vucomiss {{.*}}(%rip), %xmm0 +; AVX-NEXT: setb (%rax) +; AVX-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; AVX-NEXT: retq +; +; AVX512-LABEL: PR39256: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{.*}}(%rip), %xmm0 +; AVX512-NEXT: setb (%rax) +; AVX512-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; AVX512-NEXT: retq +entry: + %l = load float, float* %ptr, !nontemporal !1 
+ %C = fcmp ult float %l, 0x36A0000000000000 + store i1 %C, i1* undef + ret i32 -2147483648 +} + !1 = !{i32 1} -- GitLab From 0b50ad3f83305d5904d9f0ffa94424134a7b6d82 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 10:26:59 +0000 Subject: [PATCH 0093/1116] [X86][AVX] Add examples of shuffles that can be reduced to a cross-lane shuffle followed by a in-lane permute Suitable for lowering by D53148 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344332 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/vector-shuffle-256-v16.ll | 26 ++++++++++++++++++ test/CodeGen/X86/vector-shuffle-256-v32.ll | 31 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index de587beadc1..90970f15fea 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4052,6 +4052,32 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2 ret <16 x i16> %shuffle } +define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: 
shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24] +; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> + ret <16 x i16> %shuffle +} + define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu: ; AVX1: # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 3e36b4a3b6a..5e9f30a727d 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2495,6 +2495,37 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ret <32 x i8> %shuffle } +define <32 x i8> @shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: +; AVX512VLBW: # %bb.0: +; 
AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] +; AVX512VLBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,32,34,36,38,40,42,44,46,33,35,37,39,41,43,45,47] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> + ret <32 x i8> %shuffle +} + define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) { ; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48: ; AVX1: # %bb.0: -- GitLab From f39b0d9784b8e8acf20d7046101eb2e229b20a8a Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Fri, 12 Oct 2018 11:23:04 +0000 Subject: [PATCH 0094/1116] [tblgen][llvm-mca] Add the ability to describe move elimination candidates via tablegen. This patch adds the ability to identify instructions that are "move elimination candidates". It also allows scheduling models to describe processor register files that allow move elimination. A move elimination candidate is an instruction that can be eliminated at register renaming stage. Each subtarget can specify which instructions are move elimination candidates with the help of tablegen class "IsOptimizableRegisterMove" (see llvm/Target/TargetInstrPredicate.td). For example, on X86, BtVer2 allows both GPR and MMX/SSE moves to be eliminated. The definition of 'IsOptimizableRegisterMove' for BtVer2 looks like this: ``` def : IsOptimizableRegisterMove<[ InstructionEquivalenceClass<[ // GPR variants. 
MOV32rr, MOV64rr, // MMX variants. MMX_MOVQ64rr, // SSE variants. MOVAPSrr, MOVUPSrr, MOVAPDrr, MOVUPDrr, MOVDQArr, MOVDQUrr, // AVX variants. VMOVAPSrr, VMOVUPSrr, VMOVAPDrr, VMOVUPDrr, VMOVDQArr, VMOVDQUrr ], CheckNot> > ]>; ``` Definitions of IsOptimizableRegisterMove from processor models of the same Target are processed by the SubtargetEmitter to auto-generate a target-specific override for each of the following predicate methods: ``` bool TargetSubtargetInfo::isOptimizableRegisterMove(const MachineInstr *MI) const; bool MCInstrAnalysis::isOptimizableRegisterMove(const MCInst &MI, unsigned CPUID) const; ``` By default, those methods return false (i.e. conservatively assume that there are no move elimination candidates). Tablegen class RegisterFile has been extended with the following information: - The set of register classes that allow move elimination. - Maximum number of moves that can be eliminated every cycle. - Whether move elimination is restricted to moves from registers that are known to be zero. This patch is structured in three parts: A first part (which is mostly boilerplate) adds the new 'isOptimizableRegisterMove' target hooks, and extends existing register file descriptors in MC by introducing new fields to describe properties related to move elimination. A second part, uses the new tablegen constructs to describe move elimination in the BtVer2 scheduling model. A third part, teaches llvm-mca how to query the new 'isOptimizableRegisterMove' hook to mark instructions that are candidates for move elimination. It also teaches class RegisterFile how to describe constraints on move elimination at PRF granularity. llvm-mca tests for btver2 show differences before/after this patch. 
Differential Revision: https://reviews.llvm.org/D53134 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344334 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/TargetSubtargetInfo.h | 13 +++ include/llvm/MC/MCInstrAnalysis.h | 11 ++ include/llvm/MC/MCSchedule.h | 7 ++ include/llvm/Target/TargetInstrPredicate.td | 8 +- include/llvm/Target/TargetSchedule.td | 30 ++++- lib/Target/X86/X86ScheduleBtVer2.td | 34 +++++- .../X86/BtVer2/reg-move-elimination-1.s | 24 ++-- .../X86/BtVer2/reg-move-elimination-2.s | 104 +++++++++--------- .../X86/BtVer2/reg-move-elimination-3.s | 86 +++++++-------- .../X86/BtVer2/reg-move-elimination-4.s | 67 ++++++----- .../X86/BtVer2/reg-move-elimination-5.s | 67 ++++++----- .../lib/HardwareUnits/RegisterFile.cpp | 14 ++- tools/llvm-mca/lib/InstrBuilder.cpp | 2 + utils/TableGen/CodeGenSchedule.cpp | 14 ++- utils/TableGen/CodeGenSchedule.h | 15 ++- utils/TableGen/SubtargetEmitter.cpp | 14 ++- 16 files changed, 315 insertions(+), 195 deletions(-) diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h index e28673de225..968e4c4b810 100644 --- a/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -169,6 +169,19 @@ public: return isZeroIdiom(MI, Mask); } + /// Returns true if MI is a candidate for move elimination. + /// + /// A candidate for move elimination may be optimized out at register renaming + /// stage. Subtargets can specify the set of optimizable moves by + /// instantiating tablegen class `IsOptimizableRegisterMove` (see + /// llvm/Target/TargetInstrPredicate.td). + /// + /// SubtargetEmitter is responsible for processing all the definitions of class + /// IsOptimizableRegisterMove, and auto-generate an override for this method. + virtual bool isOptimizableRegisterMove(const MachineInstr *MI) const { + return false; + } + /// True if the subtarget should run MachineScheduler after aggressive /// coalescing. 
/// diff --git a/include/llvm/MC/MCInstrAnalysis.h b/include/llvm/MC/MCInstrAnalysis.h index 950a1afeef5..200f10f7d64 100644 --- a/include/llvm/MC/MCInstrAnalysis.h +++ b/include/llvm/MC/MCInstrAnalysis.h @@ -136,6 +136,17 @@ public: return isZeroIdiom(MI, Mask, CPUID); } + /// Returns true if MI is a candidate for move elimination. + /// + /// Different subtargets may apply different constraints to optimizable + /// register moves. For example, on most X86 subtargets, a candidate for move + /// elimination cannot specify the same register for both source and + /// destination. + virtual bool isOptimizableRegisterMove(const MCInst &MI, + unsigned CPUID) const { + return false; + } + /// Given a branch instruction try to get the address the branch /// targets. Return true on success, and the address in Target. virtual bool diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h index 9f53a468903..8990c2e3c0d 100644 --- a/include/llvm/MC/MCSchedule.h +++ b/include/llvm/MC/MCSchedule.h @@ -142,6 +142,7 @@ struct MCSchedClassDesc { struct MCRegisterCostEntry { unsigned RegisterClassID; unsigned Cost; + bool AllowMoveElimination; }; /// A register file descriptor. @@ -159,6 +160,12 @@ struct MCRegisterFileDesc { uint16_t NumRegisterCostEntries; // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable. uint16_t RegisterCostEntryIdx; + // A value of zero means: there is no limit in the number of moves that can be + // eliminated every cycle. + uint16_t MaxMovesEliminatedPerCycle; + // True if this register file only knows how to optimize register moves from + // known zero registers. + bool AllowZeroMoveEliminationOnly; }; /// Provide extra details about the machine processor. 
diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td index c4b14eba776..f70af259603 100644 --- a/include/llvm/Target/TargetInstrPredicate.td +++ b/include/llvm/Target/TargetInstrPredicate.td @@ -313,7 +313,7 @@ class STIPredicate; @@ -323,8 +323,14 @@ def IsDepBreakingDecl : STIPredicateDecl<"isDependencyBreaking">; } // UpdatesOpcodeMask +def IsOptimizableRegisterMoveDecl + : STIPredicateDecl<"isOptimizableRegisterMove">; + class IsZeroIdiomFunction classes> : STIPredicate; class IsDepBreakingFunction classes> : STIPredicate; + +class IsOptimizableRegisterMove classes> + : STIPredicate; diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index 7d7ce2dabe0..77b1927f932 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -460,6 +460,10 @@ class SchedAlias { // - The number of physical registers which can be used for register renaming // purpose. // - The cost of a register rename. +// - The set of registers that allow move elimination. +// - The maximum number of moves that can be eliminated every cycle. +// - Whether move elimination is limited to register moves whose input +// is known to be zero. // // The cost of a rename is the number of physical registers allocated by the // register alias table to map the new definition. By default, register can be @@ -506,11 +510,35 @@ class SchedAlias { // partial write is combined with the previous super-register definition. We // should add support for these cases, and correctly model merge problems with // partial register accesses. +// +// Field MaxMovesEliminatedPerCycle specifies how many moves can be eliminated +// every cycle. A default value of zero for that field means: there is no limit +// to the number of moves that can be eliminated by this register file. 
+// +// An instruction MI is a candidate for move elimination if a call to +// method TargetSubtargetInfo::isOptimizableRegisterMove(MI) returns true (see +// llvm/CodeGen/TargetSubtargetInfo.h, and llvm/MC/MCInstrAnalysis.h). +// +// Subtargets can instantiate tablegen class IsOptimizableRegisterMove (see +// llvm/Target/TargetInstrPredicate.td) to customize the set of move elimination +// candidates. By default, no instruction is a valid move elimination candidate. +// +// A register move MI is eliminated only if: +// - MI is a move elimination candidate. +// - The destination register is from a register class that allows move +// elimination (see field `AllowMoveElimination` below). +// - Constraints on the move kind, and the maximum number of moves that can be +// eliminated per cycle are all met. + class RegisterFile Classes = [], - list Costs = []> { + list Costs = [], list AllowMoveElim = [], + int MaxMoveElimPerCy = 0, bit AllowZeroMoveElimOnly = 0> { list RegClasses = Classes; list RegCosts = Costs; + list AllowMoveElimination = AllowMoveElim; int NumPhysRegs = numPhysRegs; + int MaxMovesEliminatedPerCycle = MaxMoveElimPerCy; + bit AllowZeroMoveEliminationOnly = AllowZeroMoveElimOnly; SchedMachineModel SchedModel = ?; } diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 2c1a4b6c7f5..33a6b01546d 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -48,12 +48,22 @@ def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM // part of it. // Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register // access" - Agner Fog's "microarchitecture.pdf". -def JIntegerPRF : RegisterFile<64, [GR64, CCR]>; +def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0], + 0, // Max moves that can be eliminated per cycle. + 1>; // Restrict move elimination to zero regs. // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE // registers. 
Operations on 256-bit data types are cracked into two COPs. // Reference: www.realworldtech.com/jaguar/4/ -def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>; + +// The PRF in the floating point unit can eliminate a move from a MMX or SSE +// register that is known to be zero (i.e. it has been zeroed using a zero-idiom +// dependency breaking instruction, or via VZEROALL). +// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking +// instructions" - Agner Fog's "microarchitecture.pdf" +def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0], + 0, // Max moves that can be eliminated per cycle. + 1>; // Restrict move elimination to zero regs. // The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can // retire up to two macro-ops per cycle. @@ -805,4 +815,24 @@ def : IsDepBreakingFunction<[ ], ZeroIdiomPredicate> ]>; +def : IsOptimizableRegisterMove<[ + InstructionEquivalenceClass<[ + // GPR variants. + MOV32rr, MOV64rr, + + // MMX variants. + MMX_MOVQ64rr, + + // SSE variants. + MOVAPSrr, MOVUPSrr, + MOVAPDrr, MOVUPDrr, + MOVDQArr, MOVDQUrr, + + // AVX variants. 
+ VMOVAPSrr, VMOVUPSrr, + VMOVAPDrr, VMOVUPDrr, + VMOVDQArr, VMOVDQUrr + ], TruePred > +]>; + } // SchedModel diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s index d2588bef30e..3b38173ebca 100644 --- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s @@ -32,13 +32,13 @@ vaddps %xmm1, %xmm1, %xmm2 # CHECK-NEXT: 1 3 1.00 vaddps %xmm1, %xmm1, %xmm2 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 6 -# CHECK-NEXT: Max number of mappings used: 5 +# CHECK-NEXT: Total number of mappings created: 3 +# CHECK-NEXT: Max number of mappings used: 3 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 6 -# CHECK-NEXT: Max number of mappings used: 5 +# CHECK-NEXT: Total number of mappings created: 3 +# CHECK-NEXT: Max number of mappings used: 3 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -63,25 +63,25 @@ vaddps %xmm1, %xmm1, %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - +# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmovaps %xmm0, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovaps %xmm0, %xmm1 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm1, %xmm1, %xmm2 # CHECK: Timeline view: # CHECK-NEXT: Index 0123456789 # CHECK: [0,0] DR . . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [0,1] DeER . . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [0,1] DR . . 
vmovaps %xmm0, %xmm1 # CHECK-NEXT: [0,2] .DeeeER . vaddps %xmm1, %xmm1, %xmm2 # CHECK-NEXT: [1,0] .D----R . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [1,1] . DeE--R . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [1,2] . D=eeeER. vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: [1,1] . D----R . vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [1,2] . DeeeER . vaddps %xmm1, %xmm1, %xmm2 # CHECK-NEXT: [2,0] . D----R. vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [2,1] . DeE---R vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [2,1] . D----R. vmovaps %xmm0, %xmm1 # CHECK-NEXT: [2,2] . DeeeER vaddps %xmm1, %xmm1, %xmm2 # CHECK: Average Wait times (based on the timeline view): @@ -92,5 +92,5 @@ vaddps %xmm1, %xmm1, %xmm2 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 vmovaps %xmm0, %xmm1 -# CHECK-NEXT: 2. 3 1.3 0.0 0.0 vaddps %xmm1, %xmm1, %xmm2 +# CHECK-NEXT: 1. 3 0.0 0.0 2.7 vmovaps %xmm0, %xmm1 +# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vaddps %xmm1, %xmm1, %xmm2 diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s index 33cd3972194..096fe6c5a8f 100644 --- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s @@ -14,12 +14,12 @@ movdqu %xmm5, %xmm0 # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 27 -# CHECK-NEXT: Total Cycles: 19 +# CHECK-NEXT: Total Cycles: 15 # CHECK-NEXT: Total uOps: 27 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.42 -# CHECK-NEXT: IPC: 1.42 +# CHECK-NEXT: uOps Per Cycle: 1.80 +# CHECK-NEXT: IPC: 1.80 # CHECK-NEXT: Block RThroughput: 4.5 # CHECK: Instruction Info: @@ -42,13 +42,13 @@ movdqu %xmm5, %xmm0 # CHECK-NEXT: 1 1 0.50 movdqu %xmm5, %xmm0 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 21 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * 
Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 21 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -73,51 +73,51 @@ movdqu %xmm5, %xmm0 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 2.00 2.00 3.33 3.67 - - - - 1.33 1.67 - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - pxor %mm0, %mm0 -# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - movq %mm0, %mm1 +# CHECK-NEXT: - - - - - - - - - - - - - - movq %mm0, %mm1 # CHECK-NEXT: - - - - - - - - - - - - - - xorps %xmm0, %xmm0 -# CHECK-NEXT: - - - - 1.00 0.33 0.67 - - - - - - - movaps %xmm0, %xmm1 -# CHECK-NEXT: - - - 1.00 - 0.33 0.67 - - - - - - - movups %xmm1, %xmm2 -# CHECK-NEXT: - - - - 1.00 0.67 0.33 - - - - - - - movapd %xmm2, %xmm3 -# CHECK-NEXT: - - - 1.00 - 0.33 0.67 - - - - - - - movupd %xmm3, %xmm4 -# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - movdqa %xmm4, %xmm5 -# CHECK-NEXT: - - - - - 0.67 0.33 - - - - 0.33 0.67 - movdqu %xmm5, %xmm0 +# CHECK-NEXT: - - - - - - - - - - - - - - movaps %xmm0, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - movups %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - movapd %xmm2, %xmm3 +# CHECK-NEXT: - - - - - - - - - - - - - - movupd %xmm3, %xmm4 +# CHECK-NEXT: - - - - - - - - - - - - - - movdqa %xmm4, %xmm5 +# CHECK-NEXT: - - - - - - - - - - - - - - movdqu %xmm5, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 012345678 +# CHECK-NEXT: 01234 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DR . . . . pxor %mm0, %mm0 -# CHECK-NEXT: [0,1] DeER . . . . 
movq %mm0, %mm1 -# CHECK-NEXT: [0,2] .D-R . . . . xorps %xmm0, %xmm0 -# CHECK-NEXT: [0,3] .DeER. . . . movaps %xmm0, %xmm1 -# CHECK-NEXT: [0,4] . DeER . . . movups %xmm1, %xmm2 -# CHECK-NEXT: [0,5] . D=eER . . . movapd %xmm2, %xmm3 -# CHECK-NEXT: [0,6] . D=eER . . . movupd %xmm3, %xmm4 -# CHECK-NEXT: [0,7] . D==eER . . . movdqa %xmm4, %xmm5 -# CHECK-NEXT: [0,8] . D==eER. . . movdqu %xmm5, %xmm0 -# CHECK-NEXT: [1,0] . D----R. . . pxor %mm0, %mm0 -# CHECK-NEXT: [1,1] . DeE--R . . movq %mm0, %mm1 -# CHECK-NEXT: [1,2] . D----R . . xorps %xmm0, %xmm0 -# CHECK-NEXT: [1,3] . .DeE--R . . movaps %xmm0, %xmm1 -# CHECK-NEXT: [1,4] . .D=eE-R . . movups %xmm1, %xmm2 -# CHECK-NEXT: [1,5] . . D=eE-R . . movapd %xmm2, %xmm3 -# CHECK-NEXT: [1,6] . . D==eER . . movupd %xmm3, %xmm4 -# CHECK-NEXT: [1,7] . . D==eER . . movdqa %xmm4, %xmm5 -# CHECK-NEXT: [1,8] . . D===eER. . movdqu %xmm5, %xmm0 -# CHECK-NEXT: [2,0] . . D----R. . pxor %mm0, %mm0 -# CHECK-NEXT: [2,1] . . DeE---R . movq %mm0, %mm1 -# CHECK-NEXT: [2,2] . . D----R . xorps %xmm0, %xmm0 -# CHECK-NEXT: [2,3] . . DeE---R . movaps %xmm0, %xmm1 -# CHECK-NEXT: [2,4] . . .DeE--R . movups %xmm1, %xmm2 -# CHECK-NEXT: [2,5] . . .D=eE--R. movapd %xmm2, %xmm3 -# CHECK-NEXT: [2,6] . . . D=eE-R. movupd %xmm3, %xmm4 -# CHECK-NEXT: [2,7] . . . D==eE-R movdqa %xmm4, %xmm5 -# CHECK-NEXT: [2,8] . . . D==eER movdqu %xmm5, %xmm0 +# CHECK: [0,0] DR . . . pxor %mm0, %mm0 +# CHECK-NEXT: [0,1] DR . . . movq %mm0, %mm1 +# CHECK-NEXT: [0,2] .DR . . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [0,3] .DR . . . movaps %xmm0, %xmm1 +# CHECK-NEXT: [0,4] . DR . . . movups %xmm1, %xmm2 +# CHECK-NEXT: [0,5] . DR . . . movapd %xmm2, %xmm3 +# CHECK-NEXT: [0,6] . DR. . . movupd %xmm3, %xmm4 +# CHECK-NEXT: [0,7] . DR. . . movdqa %xmm4, %xmm5 +# CHECK-NEXT: [0,8] . DR . . movdqu %xmm5, %xmm0 +# CHECK-NEXT: [1,0] . DR . . pxor %mm0, %mm0 +# CHECK-NEXT: [1,1] . DR . . movq %mm0, %mm1 +# CHECK-NEXT: [1,2] . DR . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [1,3] . .DR . . 
movaps %xmm0, %xmm1 +# CHECK-NEXT: [1,4] . .DR . . movups %xmm1, %xmm2 +# CHECK-NEXT: [1,5] . . DR . . movapd %xmm2, %xmm3 +# CHECK-NEXT: [1,6] . . DR . . movupd %xmm3, %xmm4 +# CHECK-NEXT: [1,7] . . DR. . movdqa %xmm4, %xmm5 +# CHECK-NEXT: [1,8] . . DR. . movdqu %xmm5, %xmm0 +# CHECK-NEXT: [2,0] . . DR . pxor %mm0, %mm0 +# CHECK-NEXT: [2,1] . . DR . movq %mm0, %mm1 +# CHECK-NEXT: [2,2] . . DR . xorps %xmm0, %xmm0 +# CHECK-NEXT: [2,3] . . DR . movaps %xmm0, %xmm1 +# CHECK-NEXT: [2,4] . . .DR . movups %xmm1, %xmm2 +# CHECK-NEXT: [2,5] . . .DR . movapd %xmm2, %xmm3 +# CHECK-NEXT: [2,6] . . . DR. movupd %xmm3, %xmm4 +# CHECK-NEXT: [2,7] . . . DR. movdqa %xmm4, %xmm5 +# CHECK-NEXT: [2,8] . . . DR movdqu %xmm5, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -126,12 +126,12 @@ movdqu %xmm5, %xmm0 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 0.0 0.0 2.7 pxor %mm0, %mm0 -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 movq %mm0, %mm1 -# CHECK-NEXT: 2. 3 0.0 0.0 3.0 xorps %xmm0, %xmm0 -# CHECK-NEXT: 3. 3 1.0 1.0 1.7 movaps %xmm0, %xmm1 -# CHECK-NEXT: 4. 3 1.3 0.0 1.0 movups %xmm1, %xmm2 -# CHECK-NEXT: 5. 3 2.0 0.0 1.0 movapd %xmm2, %xmm3 -# CHECK-NEXT: 6. 3 2.3 0.0 0.3 movupd %xmm3, %xmm4 -# CHECK-NEXT: 7. 3 3.0 0.0 0.3 movdqa %xmm4, %xmm5 -# CHECK-NEXT: 8. 3 3.3 0.0 0.0 movdqu %xmm5, %xmm0 +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 pxor %mm0, %mm0 +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 movq %mm0, %mm1 +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 xorps %xmm0, %xmm0 +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 movaps %xmm0, %xmm1 +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 movups %xmm1, %xmm2 +# CHECK-NEXT: 5. 3 0.0 0.0 0.0 movapd %xmm2, %xmm3 +# CHECK-NEXT: 6. 3 0.0 0.0 0.0 movupd %xmm3, %xmm4 +# CHECK-NEXT: 7. 3 0.0 0.0 0.0 movdqa %xmm4, %xmm5 +# CHECK-NEXT: 8. 
3 0.0 0.0 0.0 movdqu %xmm5, %xmm0 diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s index e3e0abc75e7..3d64bfd0bfd 100644 --- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s @@ -11,12 +11,12 @@ vmovdqu %xmm5, %xmm0 # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 21 -# CHECK-NEXT: Total Cycles: 16 +# CHECK-NEXT: Total Cycles: 12 # CHECK-NEXT: Total uOps: 21 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.31 -# CHECK-NEXT: IPC: 1.31 +# CHECK-NEXT: uOps Per Cycle: 1.75 +# CHECK-NEXT: IPC: 1.75 # CHECK-NEXT: Block RThroughput: 3.5 # CHECK: Instruction Info: @@ -37,13 +37,13 @@ vmovdqu %xmm5, %xmm0 # CHECK-NEXT: 1 1 0.50 vmovdqu %xmm5, %xmm0 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 18 -# CHECK-NEXT: Max number of mappings used: 9 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 -# CHECK-NEXT: Total number of mappings created: 18 -# CHECK-NEXT: Max number of mappings used: 9 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 @@ -68,43 +68,43 @@ vmovdqu %xmm5, %xmm0 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 2.00 2.00 3.00 3.00 - - - - 1.00 1.00 - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: - - - - 1.00 0.33 0.67 - - - - - - - vmovaps %xmm0, %xmm1 -# CHECK-NEXT: - - - 1.00 - 0.67 0.33 - - - - - - - 
vmovups %xmm1, %xmm2 -# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmovapd %xmm2, %xmm3 -# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmovupd %xmm3, %xmm4 -# CHECK-NEXT: - - - - - 0.33 0.67 - - - - - 1.00 - vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: - - - - - 0.67 0.33 - - - - 1.00 - - vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovaps %xmm0, %xmm1 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovups %xmm1, %xmm2 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovapd %xmm2, %xmm3 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovupd %xmm3, %xmm4 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: - - - - - - - - - - - - - - vmovdqu %xmm5, %xmm0 # CHECK: Timeline view: -# CHECK-NEXT: 012345 +# CHECK-NEXT: 01 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DR . . . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [0,1] DeER . . . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [0,2] .DeER. . . vmovups %xmm1, %xmm2 -# CHECK-NEXT: [0,3] .D=eER . . vmovapd %xmm2, %xmm3 -# CHECK-NEXT: [0,4] . D=eER . . vmovupd %xmm3, %xmm4 -# CHECK-NEXT: [0,5] . D==eER . . vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: [0,6] . D==eER . . vmovdqu %xmm5, %xmm0 -# CHECK-NEXT: [1,0] . D----R . . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [1,1] . DeE--R. . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [1,2] . D=eE-R. . vmovups %xmm1, %xmm2 -# CHECK-NEXT: [1,3] . D=eE-R . vmovapd %xmm2, %xmm3 -# CHECK-NEXT: [1,4] . D==eER . vmovupd %xmm3, %xmm4 -# CHECK-NEXT: [1,5] . .D==eER . vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: [1,6] . .D===eER . vmovdqu %xmm5, %xmm0 -# CHECK-NEXT: [2,0] . . D----R . vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: [2,1] . . DeE---R . vmovaps %xmm0, %xmm1 -# CHECK-NEXT: [2,2] . . DeE--R . vmovups %xmm1, %xmm2 -# CHECK-NEXT: [2,3] . . D=eE--R. vmovapd %xmm2, %xmm3 -# CHECK-NEXT: [2,4] . . D=eE-R. vmovupd %xmm3, %xmm4 -# CHECK-NEXT: [2,5] . . D==eE-R vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: [2,6] . . D==eER vmovdqu %xmm5, %xmm0 +# CHECK: [0,0] DR . .. 
vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [0,1] DR . .. vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [0,2] .DR . .. vmovups %xmm1, %xmm2 +# CHECK-NEXT: [0,3] .DR . .. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [0,4] . DR . .. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [0,5] . DR . .. vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [0,6] . DR. .. vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: [1,0] . DR. .. vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [1,1] . DR .. vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [1,2] . DR .. vmovups %xmm1, %xmm2 +# CHECK-NEXT: [1,3] . DR .. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [1,4] . DR .. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [1,5] . .DR .. vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [1,6] . .DR .. vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: [2,0] . . DR .. vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: [2,1] . . DR .. vmovaps %xmm0, %xmm1 +# CHECK-NEXT: [2,2] . . DR.. vmovups %xmm1, %xmm2 +# CHECK-NEXT: [2,3] . . DR.. vmovapd %xmm2, %xmm3 +# CHECK-NEXT: [2,4] . . DR. vmovupd %xmm3, %xmm4 +# CHECK-NEXT: [2,5] . . DR. vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: [2,6] . . DR vmovdqu %xmm5, %xmm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -113,10 +113,10 @@ vmovdqu %xmm5, %xmm0 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0 -# CHECK-NEXT: 1. 3 1.0 1.0 1.7 vmovaps %xmm0, %xmm1 -# CHECK-NEXT: 2. 3 1.3 0.0 1.0 vmovups %xmm1, %xmm2 -# CHECK-NEXT: 3. 3 2.0 0.0 1.0 vmovapd %xmm2, %xmm3 -# CHECK-NEXT: 4. 3 2.3 0.0 0.3 vmovupd %xmm3, %xmm4 -# CHECK-NEXT: 5. 3 3.0 0.0 0.3 vmovdqa %xmm4, %xmm5 -# CHECK-NEXT: 6. 3 3.3 0.0 0.0 vmovdqu %xmm5, %xmm0 +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 vxorps %xmm0, %xmm0, %xmm0 +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 vmovaps %xmm0, %xmm1 +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 vmovups %xmm1, %xmm2 +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 vmovapd %xmm2, %xmm3 +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 vmovupd %xmm3, %xmm4 +# CHECK-NEXT: 5. 3 0.0 0.0 0.0 vmovdqa %xmm4, %xmm5 +# CHECK-NEXT: 6. 
3 0.0 0.0 0.0 vmovdqu %xmm5, %xmm0 diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s index 72ca7693c5f..223b4c2c239 100644 --- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s @@ -9,12 +9,12 @@ mov %edx, %eax # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 15 -# CHECK-NEXT: Total Cycles: 12 +# CHECK-NEXT: Total Cycles: 9 # CHECK-NEXT: Total uOps: 15 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.25 -# CHECK-NEXT: IPC: 1.25 +# CHECK-NEXT: uOps Per Cycle: 1.67 +# CHECK-NEXT: IPC: 1.67 # CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Instruction Info: @@ -33,8 +33,8 @@ mov %edx, %eax # CHECK-NEXT: 1 1 0.50 movl %edx, %eax # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings used: 7 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 @@ -43,8 +43,8 @@ mov %edx, %eax # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 -# CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings used: 7 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: Resources: # CHECK-NEXT: [0] - JALU0 @@ -64,35 +64,34 @@ mov %edx, %eax # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - xorl %eax, %eax -# CHECK-NEXT: 0.33 0.67 - - - - - - - - - - - - movl %eax, %ebx -# CHECK-NEXT: 1.00 - - - - - - 
- - - - - - - movl %ebx, %ecx -# CHECK-NEXT: - 1.00 - - - - - - - - - - - - movl %ecx, %edx -# CHECK-NEXT: 0.67 0.33 - - - - - - - - - - - - movl %edx, %eax +# CHECK-NEXT: - - - - - - - - - - - - - - movl %eax, %ebx +# CHECK-NEXT: - - - - - - - - - - - - - - movl %ebx, %ecx +# CHECK-NEXT: - - - - - - - - - - - - - - movl %ecx, %edx +# CHECK-NEXT: - - - - - - - - - - - - - - movl %edx, %eax # CHECK: Timeline view: -# CHECK-NEXT: 01 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DR . .. xorl %eax, %eax -# CHECK-NEXT: [0,1] DeER . .. movl %eax, %ebx -# CHECK-NEXT: [0,2] .DeER. .. movl %ebx, %ecx -# CHECK-NEXT: [0,3] .D=eER .. movl %ecx, %edx -# CHECK-NEXT: [0,4] . D=eER .. movl %edx, %eax -# CHECK-NEXT: [1,0] . D---R .. xorl %eax, %eax -# CHECK-NEXT: [1,1] . DeE-R .. movl %eax, %ebx -# CHECK-NEXT: [1,2] . D=eER .. movl %ebx, %ecx -# CHECK-NEXT: [1,3] . D=eER .. movl %ecx, %edx -# CHECK-NEXT: [1,4] . D==eER.. movl %edx, %eax -# CHECK-NEXT: [2,0] . D---R.. xorl %eax, %eax -# CHECK-NEXT: [2,1] . DeE--R. movl %eax, %ebx -# CHECK-NEXT: [2,2] . .DeE-R. movl %ebx, %ecx -# CHECK-NEXT: [2,3] . .D=eE-R movl %ecx, %edx -# CHECK-NEXT: [2,4] . . D=eER movl %edx, %eax +# CHECK: [0,0] DR . . xorl %eax, %eax +# CHECK-NEXT: [0,1] DR . . movl %eax, %ebx +# CHECK-NEXT: [0,2] .DR . . movl %ebx, %ecx +# CHECK-NEXT: [0,3] .DR . . movl %ecx, %edx +# CHECK-NEXT: [0,4] . DR . . movl %edx, %eax +# CHECK-NEXT: [1,0] . DR . . xorl %eax, %eax +# CHECK-NEXT: [1,1] . DR. . movl %eax, %ebx +# CHECK-NEXT: [1,2] . DR. . movl %ebx, %ecx +# CHECK-NEXT: [1,3] . DR . movl %ecx, %edx +# CHECK-NEXT: [1,4] . DR . movl %edx, %eax +# CHECK-NEXT: [2,0] . DR . xorl %eax, %eax +# CHECK-NEXT: [2,1] . DR . movl %eax, %ebx +# CHECK-NEXT: [2,2] . .DR. movl %ebx, %ecx +# CHECK-NEXT: [2,3] . .DR. movl %ecx, %edx +# CHECK-NEXT: [2,4] . . 
DR movl %edx, %eax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -101,8 +100,8 @@ mov %edx, %eax # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 0.0 0.0 2.0 xorl %eax, %eax -# CHECK-NEXT: 1. 3 1.0 1.0 1.0 movl %eax, %ebx -# CHECK-NEXT: 2. 3 1.3 0.0 0.3 movl %ebx, %ecx -# CHECK-NEXT: 3. 3 2.0 0.0 0.3 movl %ecx, %edx -# CHECK-NEXT: 4. 3 2.3 0.0 0.0 movl %edx, %eax +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 xorl %eax, %eax +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 movl %eax, %ebx +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 movl %ebx, %ecx +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 movl %ecx, %edx +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 movl %edx, %eax diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s index 7d6b75f7c3f..ab873c7c43f 100644 --- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s +++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s @@ -9,12 +9,12 @@ mov %rdx, %rax # CHECK: Iterations: 3 # CHECK-NEXT: Instructions: 15 -# CHECK-NEXT: Total Cycles: 12 +# CHECK-NEXT: Total Cycles: 9 # CHECK-NEXT: Total uOps: 15 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 1.25 -# CHECK-NEXT: IPC: 1.25 +# CHECK-NEXT: uOps Per Cycle: 1.67 +# CHECK-NEXT: IPC: 1.67 # CHECK-NEXT: Block RThroughput: 2.5 # CHECK: Instruction Info: @@ -33,8 +33,8 @@ mov %rdx, %rax # CHECK-NEXT: 1 1 0.50 movq %rdx, %rax # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings used: 7 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 @@ -43,8 +43,8 @@ mov %rdx, %rax # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 -# CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings 
used: 7 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: Resources: # CHECK-NEXT: [0] - JALU0 @@ -64,35 +64,34 @@ mov %rdx, %rax # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: - - - - - - - - - - - - - - xorq %rax, %rax -# CHECK-NEXT: 0.33 0.67 - - - - - - - - - - - - movq %rax, %rbx -# CHECK-NEXT: 1.00 - - - - - - - - - - - - - movq %rbx, %rcx -# CHECK-NEXT: - 1.00 - - - - - - - - - - - - movq %rcx, %rdx -# CHECK-NEXT: 0.67 0.33 - - - - - - - - - - - - movq %rdx, %rax +# CHECK-NEXT: - - - - - - - - - - - - - - movq %rax, %rbx +# CHECK-NEXT: - - - - - - - - - - - - - - movq %rbx, %rcx +# CHECK-NEXT: - - - - - - - - - - - - - - movq %rcx, %rdx +# CHECK-NEXT: - - - - - - - - - - - - - - movq %rdx, %rax # CHECK: Timeline view: -# CHECK-NEXT: 01 -# CHECK-NEXT: Index 0123456789 +# CHECK-NEXT: Index 012345678 -# CHECK: [0,0] DR . .. xorq %rax, %rax -# CHECK-NEXT: [0,1] DeER . .. movq %rax, %rbx -# CHECK-NEXT: [0,2] .DeER. .. movq %rbx, %rcx -# CHECK-NEXT: [0,3] .D=eER .. movq %rcx, %rdx -# CHECK-NEXT: [0,4] . D=eER .. movq %rdx, %rax -# CHECK-NEXT: [1,0] . D---R .. xorq %rax, %rax -# CHECK-NEXT: [1,1] . DeE-R .. movq %rax, %rbx -# CHECK-NEXT: [1,2] . D=eER .. movq %rbx, %rcx -# CHECK-NEXT: [1,3] . D=eER .. movq %rcx, %rdx -# CHECK-NEXT: [1,4] . D==eER.. movq %rdx, %rax -# CHECK-NEXT: [2,0] . D---R.. xorq %rax, %rax -# CHECK-NEXT: [2,1] . DeE--R. movq %rax, %rbx -# CHECK-NEXT: [2,2] . .DeE-R. movq %rbx, %rcx -# CHECK-NEXT: [2,3] . .D=eE-R movq %rcx, %rdx -# CHECK-NEXT: [2,4] . . D=eER movq %rdx, %rax +# CHECK: [0,0] DR . . xorq %rax, %rax +# CHECK-NEXT: [0,1] DR . . movq %rax, %rbx +# CHECK-NEXT: [0,2] .DR . . 
movq %rbx, %rcx +# CHECK-NEXT: [0,3] .DR . . movq %rcx, %rdx +# CHECK-NEXT: [0,4] . DR . . movq %rdx, %rax +# CHECK-NEXT: [1,0] . DR . . xorq %rax, %rax +# CHECK-NEXT: [1,1] . DR. . movq %rax, %rbx +# CHECK-NEXT: [1,2] . DR. . movq %rbx, %rcx +# CHECK-NEXT: [1,3] . DR . movq %rcx, %rdx +# CHECK-NEXT: [1,4] . DR . movq %rdx, %rax +# CHECK-NEXT: [2,0] . DR . xorq %rax, %rax +# CHECK-NEXT: [2,1] . DR . movq %rax, %rbx +# CHECK-NEXT: [2,2] . .DR. movq %rbx, %rcx +# CHECK-NEXT: [2,3] . .DR. movq %rcx, %rdx +# CHECK-NEXT: [2,4] . . DR movq %rdx, %rax # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -101,8 +100,8 @@ mov %rdx, %rax # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 0.0 0.0 2.0 xorq %rax, %rax -# CHECK-NEXT: 1. 3 1.0 1.0 1.0 movq %rax, %rbx -# CHECK-NEXT: 2. 3 1.3 0.0 0.3 movq %rbx, %rcx -# CHECK-NEXT: 3. 3 2.0 0.0 0.3 movq %rcx, %rdx -# CHECK-NEXT: 4. 3 2.3 0.0 0.0 movq %rdx, %rax +# CHECK-NEXT: 0. 3 0.0 0.0 0.0 xorq %rax, %rax +# CHECK-NEXT: 1. 3 0.0 0.0 0.0 movq %rax, %rbx +# CHECK-NEXT: 2. 3 0.0 0.0 0.0 movq %rbx, %rcx +# CHECK-NEXT: 3. 3 0.0 0.0 0.0 movq %rcx, %rdx +# CHECK-NEXT: 4. 3 0.0 0.0 0.0 movq %rdx, %rax diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp index 4cfe1a50f53..481e2e18fa9 100644 --- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -73,7 +73,8 @@ void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF, // registers in register file #0 through the command line flag // `-register-file-size`. unsigned RegisterFileIndex = RegisterFiles.size(); - RegisterFiles.emplace_back(RF.NumPhysRegs); + RegisterFiles.emplace_back(RF.NumPhysRegs, RF.MaxMovesEliminatedPerCycle, + RF.AllowZeroMoveEliminationOnly); // Special case where there is no register class identifier in the set. 
// An empty set of register classes means: this register file contains all @@ -99,6 +100,7 @@ void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF, } IPC = std::make_pair(RegisterFileIndex, RCE.Cost); Entry.RenameAs = Reg; + Entry.AllowMoveElimination = RCE.AllowMoveElimination; // Assume the same cost for each sub-register. for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) { @@ -273,10 +275,6 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) { const RegisterMapping &RMFrom = RegisterMappings[RS.getRegisterID()]; const RegisterMapping &RMTo = RegisterMappings[WS.getRegisterID()]; - // Early exit if the PRF doesn't support move elimination for this register. - if (!RMTo.second.AllowMoveElimination) - return false; - // From and To must be owned by the same PRF. const RegisterRenamingInfo &RRIFrom = RMFrom.second; const RegisterRenamingInfo &RRITo = RMTo.second; @@ -298,9 +296,13 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) { // For now, we assume that there is a strong correlation between registers // that allow move elimination, and how those same registers are renamed in // hardware. - if (RRITo.RenameAs && RRITo.RenameAs != WS.getRegisterID()) + if (RRITo.RenameAs && RRITo.RenameAs != WS.getRegisterID()) { + // Early exit if the PRF doesn't support move elimination for this register. 
+ if (!RegisterMappings[RRITo.RenameAs].second.AllowMoveElimination) + return false; if (!WS.clearsSuperRegisters()) return false; + } RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex]; if (RMT.MaxMoveEliminatedPerCycle && diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp index 0a26f40b940..1cb020a9f6d 100644 --- a/tools/llvm-mca/lib/InstrBuilder.cpp +++ b/tools/llvm-mca/lib/InstrBuilder.cpp @@ -463,6 +463,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) { bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID); bool IsDepBreaking = IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID); + if (MCIA.isOptimizableRegisterMove(MCI, ProcID)) + NewIS->setOptimizableMove(); // Initialize Reads first. for (const ReadDescriptor &RD : D.Reads) { diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index f8d7d9ad3d3..e94ed760fc4 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -1759,6 +1759,10 @@ void CodeGenSchedModels::collectRegisterFiles() { CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel")); PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF)); CodeGenRegisterFile &CGRF = PM.RegisterFiles.back(); + CGRF.MaxMovesEliminatedPerCycle = + RF->getValueAsInt("MaxMovesEliminatedPerCycle"); + CGRF.AllowZeroMoveEliminationOnly = + RF->getValueAsBit("AllowZeroMoveEliminationOnly"); // Now set the number of physical registers as well as the cost of registers // in each register class. @@ -1770,9 +1774,17 @@ void CodeGenSchedModels::collectRegisterFiles() { RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses"); std::vector RegisterCosts = RF->getValueAsListOfInts("RegCosts"); + ListInit *MoveElimInfo = RF->getValueAsListInit("AllowMoveElimination"); for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) { int Cost = RegisterCosts.size() > I ? 
RegisterCosts[I] : 1; - CGRF.Costs.emplace_back(RegisterClasses[I], Cost); + + bool AllowMoveElim = false; + if (MoveElimInfo->size() > I) { + BitInit *Val = cast(MoveElimInfo->getElement(I)); + AllowMoveElim = Val->getValue(); + } + + CGRF.Costs.emplace_back(RegisterClasses[I], Cost, AllowMoveElim); } } } diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h index c2af28bbaa0..39443bb35e9 100644 --- a/utils/TableGen/CodeGenSchedule.h +++ b/utils/TableGen/CodeGenSchedule.h @@ -167,8 +167,9 @@ struct CodeGenSchedClass { struct CodeGenRegisterCost { Record *RCDef; unsigned Cost; - CodeGenRegisterCost(Record *RC, unsigned RegisterCost) - : RCDef(RC), Cost(RegisterCost) {} + bool AllowMoveElimination; + CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false) + : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {} CodeGenRegisterCost(const CodeGenRegisterCost &) = default; CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete; }; @@ -181,12 +182,18 @@ struct CodeGenRegisterCost { struct CodeGenRegisterFile { std::string Name; Record *RegisterFileDef; + unsigned MaxMovesEliminatedPerCycle; + bool AllowZeroMoveEliminationOnly; unsigned NumPhysRegs; std::vector Costs; - CodeGenRegisterFile(StringRef name, Record *def) - : Name(name), RegisterFileDef(def), NumPhysRegs(0) {} + CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0, + bool AllowZeroMoveElimOnly = false) + : Name(name), RegisterFileDef(def), + MaxMovesEliminatedPerCycle(MaxMoveElimPerCy), + AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), + NumPhysRegs(0) {} bool hasDefaultCosts() const { return Costs.empty(); } }; diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp index ef0428eeed0..d1ea968590f 100644 --- a/utils/TableGen/SubtargetEmitter.cpp +++ b/utils/TableGen/SubtargetEmitter.cpp @@ -653,7 +653,7 @@ SubtargetEmitter::EmitRegisterFileTables(const 
CodeGenProcModel &ProcModel, return 0; // Print the RegisterCost table first. - OS << "\n// {RegisterClassID, Register Cost}\n"; + OS << "\n// {RegisterClassID, Register Cost, AllowMoveElimination }\n"; OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName << "RegisterCosts" << "[] = {\n"; @@ -668,24 +668,28 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel, Record *Rec = RC.RCDef; if (Rec->getValue("Namespace")) OS << Rec->getValueAsString("Namespace") << "::"; - OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n"; + OS << Rec->getName() << "RegClassID, " << RC.Cost << ", " + << RC.AllowMoveElimination << "},\n"; } } OS << "};\n"; // Now generate a table with register file info. - OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n"; + OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl, " + << "MaxMovesEliminatedPerCycle, AllowZeroMoveEliminationOnly }\n"; OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName << "RegisterFiles" << "[] = {\n" - << " { \"InvalidRegisterFile\", 0, 0, 0 },\n"; + << " { \"InvalidRegisterFile\", 0, 0, 0, 0, 0 },\n"; unsigned CostTblIndex = 0; for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) { OS << " { "; OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", "; unsigned NumCostEntries = RD.Costs.size(); - OS << NumCostEntries << ", " << CostTblIndex << "},\n"; + OS << NumCostEntries << ", " << CostTblIndex << ", " + << RD.MaxMovesEliminatedPerCycle << ", " + << RD.AllowZeroMoveEliminationOnly << "},\n"; CostTblIndex += NumCostEntries; } OS << "};\n"; -- GitLab From d584a99dbb2a9af55ddaa41265a512c8c634b7cf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 12:10:34 +0000 Subject: [PATCH 0095/1116] [X86][SSE] Add extract_subvector(PSHUFB) -> PSHUFB(extract_subvector()) combine Fixes PR32160 by reducing the size of PSHUFB if we only use one of the lanes. 
This approach can probably be generalized to handle any target shuffle (and any subvector index) but we have no test coverage at the moment. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344336 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 12 ++++++++++++ test/CodeGen/X86/vector-trunc.ll | 6 ++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index c6ab4fb70f6..15bd238833d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -40306,6 +40306,18 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, : ISD::SIGN_EXTEND_VECTOR_INREG; return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); } + if (InOpcode == ISD::BITCAST) { + // TODO - do this for target shuffles in general. + SDValue InVecBC = peekThroughOneUseBitcasts(InVec); + if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) { + SDLoc DL(N); + SDValue SubPSHUFB = + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, + extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL), + extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL)); + return DAG.getBitcast(OpVT, SubPSHUFB); + } + } } return SDValue(); diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index 0d00f8af5a8..db3692f318f 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -1922,16 +1922,14 @@ define <8 x i16> @PR32160(<8 x i32> %x) { ; ; AVX2-SLOW-LABEL: PR32160: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: PR32160: ; AVX2-FAST: # 
%bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; -- GitLab From 9a6d7be910df45c53626f8f4272c69ad971abc87 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Fri, 12 Oct 2018 12:26:37 +0000 Subject: [PATCH 0096/1116] Fix documentation of MachineInstr::getNumOperands The documentation stated "Access to explicit operands of the instruction." This is misleading, as it also lists implicit operands. Patch by Philip Ginsbach. Differential Revision: https://reviews.llvm.org/D35481 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344338 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineInstr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index 7c4e771ce72..ea1a2a536fc 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -408,7 +408,7 @@ public: /// Returns the opcode of this MachineInstr. unsigned getOpcode() const { return MCID->Opcode; } - /// Access to explicit operands of the instruction. + /// Returns the total number of operands. unsigned getNumOperands() const { return NumOperands; } const MachineOperand& getOperand(unsigned i) const { -- GitLab From 638941f488dd9d1b1b9d49913240fdc2beec56a0 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Fri, 12 Oct 2018 12:38:27 +0000 Subject: [PATCH 0097/1116] [llvm-mca] Remove method RegisterFileStatistics::initializeRegisterFileInfo(). 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344339 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-mca/Views/RegisterFileStatistics.cpp | 3 ++- tools/llvm-mca/Views/RegisterFileStatistics.h | 7 +------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/tools/llvm-mca/Views/RegisterFileStatistics.cpp index 7dbc76a51e1..cd540e9dc60 100644 --- a/tools/llvm-mca/Views/RegisterFileStatistics.cpp +++ b/tools/llvm-mca/Views/RegisterFileStatistics.cpp @@ -19,7 +19,8 @@ using namespace llvm; namespace mca { -void RegisterFileStatistics::initializeRegisterFileInfo() { +RegisterFileStatistics::RegisterFileStatistics(const llvm::MCSubtargetInfo &sti) + : STI(sti) { const MCSchedModel &SM = STI.getSchedModel(); RegisterFileUsage Empty = {0, 0, 0}; if (!SM.hasExtraProcessorInfo()) { diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.h b/tools/llvm-mca/Views/RegisterFileStatistics.h index 3dcac4d4f75..1e89d66dc50 100644 --- a/tools/llvm-mca/Views/RegisterFileStatistics.h +++ b/tools/llvm-mca/Views/RegisterFileStatistics.h @@ -51,15 +51,10 @@ class RegisterFileStatistics : public View { // There is one entry for each register file implemented by the processor. 
llvm::SmallVector RegisterFiles; - void initializeRegisterFileInfo(); - public: - RegisterFileStatistics(const llvm::MCSubtargetInfo &sti) : STI(sti) { - initializeRegisterFileInfo(); - } + RegisterFileStatistics(const llvm::MCSubtargetInfo &sti); void onEvent(const HWInstructionEvent &Event) override; - void printView(llvm::raw_ostream &OS) const override; }; } // namespace mca -- GitLab From cadc63b548f93cf19407baa6d1760eb6c668c183 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 13:24:51 +0000 Subject: [PATCH 0098/1116] [X86][AVX] Regenerate tzcnt tests git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344341 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/vector-tzcnt-256.ll | 138 +++++---------------------- 1 file changed, 24 insertions(+), 114 deletions(-) diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 775a7a359ab..b1173fa4b88 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -1370,145 +1370,55 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; AVX-LABEL: foldv8i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; AVX-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv8i32: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv8i32: -; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG-NEXT: retq -; -; X32-AVX-LABEL: foldv8i32: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; X32-AVX-NEXT: retl +; ALL-LABEL: foldv8i32: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 0) ret <8 x i32> %out } define <8 x i32> @foldv8i32u() nounwind { -; AVX-LABEL: foldv8i32u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[8,0,32,0,16,0,3,3] -; AVX-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv8i32u: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv8i32u: -; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; BITALG-NEXT: retq -; -; X32-AVX-LABEL: foldv8i32u: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] -; X32-AVX-NEXT: retl +; ALL-LABEL: foldv8i32u: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; ALL-NEXT: ret{{[l|q]}} %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> , i1 -1) ret <8 x i32> %out } define <16 x i16> @foldv16i16() nounwind { -; AVX-LABEL: foldv16i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; AVX-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv16i16: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv16i16: -; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; BITALG-NEXT: retq -; -; X32-AVX-LABEL: foldv16i16: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; X32-AVX-NEXT: retl +; ALL-LABEL: foldv16i16: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; ALL-NEXT: ret{{[l|q]}} %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 0) ret <16 x i16> %out } define <16 x i16> @foldv16i16u() nounwind { -; AVX-LABEL: foldv16i16u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; AVX-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv16i16u: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv16i16u: -; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = 
[8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; BITALG-NEXT: retq -; -; X32-AVX-LABEL: foldv16i16u: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] -; X32-AVX-NEXT: retl +; ALL-LABEL: foldv16i16u: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; ALL-NEXT: ret{{[l|q]}} %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> , i1 -1) ret <16 x i16> %out } define <32 x i8> @foldv32i8() nounwind { -; AVX-LABEL: foldv32i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; AVX-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv32i8: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv32i8: -; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; BITALG-NEXT: retq -; -; X32-AVX-LABEL: foldv32i8: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; X32-AVX-NEXT: retl +; ALL-LABEL: foldv32i8: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; ALL-NEXT: ret{{[l|q]}} %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 0) ret <32 x i8> %out } define <32 x i8> @foldv32i8u() nounwind { -; AVX-LABEL: foldv32i8u: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; AVX-NEXT: retq -; -; BITALG_NOVLX-LABEL: foldv32i8u: -; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; BITALG_NOVLX-NEXT: retq -; -; BITALG-LABEL: foldv32i8u: -; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = 
[8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; BITALG-NEXT: retq -; -; X32-AVX-LABEL: foldv32i8u: -; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] -; X32-AVX-NEXT: retl +; ALL-LABEL: foldv32i8u: +; ALL: # %bb.0: +; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; ALL-NEXT: ret{{[l|q]}} %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> , i1 -1) ret <32 x i8> %out } -- GitLab From 29956bffea8f12634f457488cfe5071fed92d120 Mon Sep 17 00:00:00 2001 From: Max Moroz Date: Fri, 12 Oct 2018 13:59:31 +0000 Subject: [PATCH 0099/1116] [SanitizerCoverage] Make Inline8bit and TracePC counters dead stripping resistant. Summary: Otherwise, at least on Mac, the linker eliminates unused symbols which causes libFuzzer to error out due to a mismatch of the sizes of coverage tables. Issue in Chromium: https://bugs.chromium.org/p/chromium/issues/detail?id=892167 Reviewers: morehouse, kcc, george.karpenkov Reviewed By: morehouse Subscribers: kubamracek, llvm-commits Differential Revision: https://reviews.llvm.org/D53113 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344345 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 2a055920c3e..bf461c61ede 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -594,6 +594,7 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( Array->setSection(getSectionName(Section)); Array->setAlignment(Ty->isPointerTy() ? 
DL->getPointerSize() : Ty->getPrimitiveSizeInBits() / 8); + GlobalsToAppendToUsed.push_back(Array); GlobalsToAppendToCompilerUsed.push_back(Array); MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F)); Array->addMetadata(LLVMContext::MD_associated, *MD); @@ -631,14 +632,14 @@ SanitizerCoverageModule::CreatePCArray(Function &F, void SanitizerCoverageModule::CreateFunctionLocalArrays( Function &F, ArrayRef AllBlocks) { - if (Options.TracePCGuard) { + if (Options.TracePCGuard) FunctionGuardArray = CreateFunctionLocalArrayInSection( AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName); - GlobalsToAppendToUsed.push_back(FunctionGuardArray); - } + if (Options.Inline8bitCounters) Function8bitCounterArray = CreateFunctionLocalArrayInSection( AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName); + if (Options.PCTable) FunctionPCsArray = CreatePCArray(F, AllBlocks); } -- GitLab From 706d3da44f2997d8fada9a5cd3ef89cf91883ccf Mon Sep 17 00:00:00 2001 From: Hiroshi Inoue Date: Fri, 12 Oct 2018 14:02:20 +0000 Subject: [PATCH 0100/1116] [PowerPC] avoid masking already-zero bits in BitPermutationSelector The current BitPermutationSelector generates code to build a value by tracking two types of bits: ConstZero and Variable. ConstZero means a bit we need to mask off and Variable is a bit we copy from an input value. This patch adds a third type of bit, VariableKnownToBeZero, produced by an AssertZext node or a zero-extending load node. VariableKnownToBeZero means a bit that comes from an input value but is known to be already zero, so we do not need to mask it. VariableKnownToBeZero enhances flexibility to group bits, since we can avoid redundant masking for these bits. This patch also renames "HasZeros" to "NeedMask" since now we may skip masking even when we have zeros (of type VariableKnownToBeZero). 
Differential Revision: https://reviews.llvm.org/D48025 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344347 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 119 ++++++++++++++++++--- test/CodeGen/PowerPC/addi-offset-fold.ll | 5 +- test/CodeGen/PowerPC/bitfieldinsert.ll | 35 +++++- test/CodeGen/PowerPC/ppc64le-aggregates.ll | 10 +- test/CodeGen/PowerPC/rlwimi-dyn-and.ll | 2 +- 5 files changed, 143 insertions(+), 28 deletions(-) diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index af17bb5f165..5ec7b102884 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1083,9 +1083,14 @@ class BitPermutationSelector { // lowest-order bit. unsigned Idx; + // ConstZero means a bit we need to mask off. + // Variable is a bit comes from an input variable. + // VariableKnownToBeZero is also a bit comes from an input variable, + // but it is known to be already zero. So we do not need to mask them. enum Kind { ConstZero, - Variable + Variable, + VariableKnownToBeZero } K; ValueBit(SDValue V, unsigned I, Kind K = Variable) @@ -1094,11 +1099,11 @@ class BitPermutationSelector { : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {} bool isZero() const { - return K == ConstZero; + return K == ConstZero || K == VariableKnownToBeZero; } bool hasValue() const { - return K == Variable; + return K == Variable || K == VariableKnownToBeZero; } SDValue getValue() const { @@ -1248,8 +1253,14 @@ class BitPermutationSelector { for (unsigned i = 0; i < NumBits; ++i) if (((Mask >> i) & 1) == 1) Bits[i] = (*LHSBits)[i]; - else - Bits[i] = ValueBit(ValueBit::ConstZero); + else { + // AND instruction masks this bit. If the input is already zero, + // we have nothing to do here. Otherwise, make the bit ConstZero. 
+ if ((*LHSBits)[i].isZero()) + Bits[i] = (*LHSBits)[i]; + else + Bits[i] = ValueBit(ValueBit::ConstZero); + } return std::make_pair(Interesting, &Bits); } @@ -1259,8 +1270,26 @@ class BitPermutationSelector { const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second; bool AllDisjoint = true; - for (unsigned i = 0; i < NumBits; ++i) - if (LHSBits[i].isZero()) + SDValue LastVal = SDValue(); + unsigned LastIdx = 0; + for (unsigned i = 0; i < NumBits; ++i) { + if (LHSBits[i].isZero() && RHSBits[i].isZero()) { + // If both inputs are known to be zero and one is ConstZero and + // another is VariableKnownToBeZero, we can select whichever + // we like. To minimize the number of bit groups, we select + // VariableKnownToBeZero if this bit is the next bit of the same + // input variable from the previous bit. Otherwise, we select + // ConstZero. + if (LHSBits[i].hasValue() && LHSBits[i].getValue() == LastVal && + LHSBits[i].getValueBitIndex() == LastIdx + 1) + Bits[i] = LHSBits[i]; + else if (RHSBits[i].hasValue() && RHSBits[i].getValue() == LastVal && + RHSBits[i].getValueBitIndex() == LastIdx + 1) + Bits[i] = RHSBits[i]; + else + Bits[i] = ValueBit(ValueBit::ConstZero); + } + else if (LHSBits[i].isZero()) Bits[i] = RHSBits[i]; else if (RHSBits[i].isZero()) Bits[i] = LHSBits[i]; @@ -1268,6 +1297,16 @@ class BitPermutationSelector { AllDisjoint = false; break; } + // We remember the value and bit index of this bit. + if (Bits[i].hasValue()) { + LastVal = Bits[i].getValue(); + LastIdx = Bits[i].getValueBitIndex(); + } + else { + if (LastVal) LastVal = SDValue(); + LastIdx = 0; + } + } if (!AllDisjoint) break; @@ -1293,6 +1332,44 @@ class BitPermutationSelector { return std::make_pair(Interesting, &Bits); } + case ISD::AssertZext: { + // For AssertZext, we look through the operand and + // mark the bits known to be zero. 
+ const SmallVector *LHSBits; + std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0), + NumBits); + + EVT FromType = cast(V.getOperand(1))->getVT(); + const unsigned NumValidBits = FromType.getSizeInBits(); + for (unsigned i = 0; i < NumValidBits; ++i) + Bits[i] = (*LHSBits)[i]; + + // These bits are known to be zero. + for (unsigned i = NumValidBits; i < NumBits; ++i) + Bits[i] = ValueBit((*LHSBits)[i].getValue(), + (*LHSBits)[i].getValueBitIndex(), + ValueBit::VariableKnownToBeZero); + + return std::make_pair(Interesting, &Bits); + } + case ISD::LOAD: + LoadSDNode *LD = cast(V); + if (ISD::isZEXTLoad(V.getNode()) && V.getResNo() == 0) { + EVT VT = LD->getMemoryVT(); + const unsigned NumValidBits = VT.getSizeInBits(); + + for (unsigned i = 0; i < NumValidBits; ++i) + Bits[i] = ValueBit(V, i); + + // These bits are known to be zero. + for (unsigned i = NumValidBits; i < NumBits; ++i) + Bits[i] = ValueBit(V, i, ValueBit::VariableKnownToBeZero); + + // Zero-extending load itself cannot be optimized. So, it is not + // interesting by itself though it gives useful information. + return std::make_pair(Interesting = false, &Bits); + } + break; } for (unsigned i = 0; i < NumBits; ++i) @@ -1304,7 +1381,7 @@ class BitPermutationSelector { // For each value (except the constant ones), compute the left-rotate amount // to get it from its original to final position. void computeRotationAmounts() { - HasZeros = false; + NeedMask = false; RLAmt.resize(Bits.size()); for (unsigned i = 0; i < Bits.size(); ++i) if (Bits[i].hasValue()) { @@ -1314,7 +1391,7 @@ class BitPermutationSelector { else RLAmt[i] = Bits.size() - (VBI - i); } else if (Bits[i].isZero()) { - HasZeros = true; + NeedMask = true; RLAmt[i] = UINT32_MAX; } else { llvm_unreachable("Unknown value bit type"); @@ -1330,6 +1407,7 @@ class BitPermutationSelector { unsigned LastRLAmt = RLAmt[0]; SDValue LastValue = Bits[0].hasValue() ? 
Bits[0].getValue() : SDValue(); unsigned LastGroupStartIdx = 0; + bool IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue(); for (unsigned i = 1; i < Bits.size(); ++i) { unsigned ThisRLAmt = RLAmt[i]; SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue(); @@ -1342,10 +1420,20 @@ class BitPermutationSelector { LastGroupStartIdx = 0; } + // If this bit is known to be zero and the current group is a bit group + // of zeros, we do not need to terminate the current bit group even the + // Value or RLAmt does not match here. Instead, we terminate this group + // when the first non-zero bit appears later. + if (IsGroupOfZeros && Bits[i].isZero()) + continue; + // If this bit has the same underlying value and the same rotate factor as // the last one, then they're part of the same group. if (ThisRLAmt == LastRLAmt && ThisValue == LastValue) - continue; + // We cannot continue the current group if this bits is not known to + // be zero in a bit group of zeros. + if (!(IsGroupOfZeros && ThisValue && !Bits[i].isZero())) + continue; if (LastValue.getNode()) BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx, @@ -1353,6 +1441,7 @@ class BitPermutationSelector { LastRLAmt = ThisRLAmt; LastValue = ThisValue; LastGroupStartIdx = i; + IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue(); } if (LastValue.getNode()) BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx, @@ -1698,7 +1787,7 @@ class BitPermutationSelector { // If we've not yet selected a 'starting' instruction, and we have no zeros // to fill in, select the (Value, RLAmt) with the highest priority (largest // number of groups), and start with this rotated value. 
- if ((!HasZeros || LateMask) && !Res) { + if ((!NeedMask || LateMask) && !Res) { ValueRotInfo &VRI = ValueRotsVec[0]; if (VRI.RLAmt) { if (InstCnt) *InstCnt += 1; @@ -2077,7 +2166,7 @@ class BitPermutationSelector { // If we've not yet selected a 'starting' instruction, and we have no zeros // to fill in, select the (Value, RLAmt) with the highest priority (largest // number of groups), and start with this rotated value. - if ((!HasZeros || LateMask) && !Res) { + if ((!NeedMask || LateMask) && !Res) { // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32 // groups will come first, and so the VRI representing the largest number // of groups might not be first (it might be the first Repl32 groups). @@ -2230,7 +2319,7 @@ class BitPermutationSelector { SmallVector Bits; - bool HasZeros; + bool NeedMask; SmallVector RLAmt; SmallVector BitGroups; @@ -2259,10 +2348,10 @@ public: " selection for: "); LLVM_DEBUG(N->dump(CurDAG)); - // Fill it RLAmt and set HasZeros. + // Fill it RLAmt and set NeedMask. computeRotationAmounts(); - if (!HasZeros) + if (!NeedMask) return Select(N, false); // We currently have two techniques for handling results with zeros: early diff --git a/test/CodeGen/PowerPC/addi-offset-fold.ll b/test/CodeGen/PowerPC/addi-offset-fold.ll index 7af99203694..db2fb0eee7c 100644 --- a/test/CodeGen/PowerPC/addi-offset-fold.ll +++ b/test/CodeGen/PowerPC/addi-offset-fold.ll @@ -27,10 +27,9 @@ entry: ; FIXME: We don't need to do these stores at all. 
; CHECK-DAG: std 3, -24(1) ; CHECK-DAG: stb 4, -16(1) -; CHECK-DAG: sldi [[REG3:[0-9]+]], 4, 32 ; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1) -; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]] -; CHECK: rldicl 3, [[REG4]], 33, 57 +; CHECK-DAG: rlwinm 3, [[REG2]], 1, 31, 31 +; CHECK: rlwimi 3, 4, 1, 25, 30 ; CHECK: blr } diff --git a/test/CodeGen/PowerPC/bitfieldinsert.ll b/test/CodeGen/PowerPC/bitfieldinsert.ll index e654c7d8a0c..76a648b6f13 100644 --- a/test/CodeGen/PowerPC/bitfieldinsert.ll +++ b/test/CodeGen/PowerPC/bitfieldinsert.ll @@ -1,6 +1,35 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s +; equivalent C code +; struct s64 { +; int a:5; +; int b:16; +; long c:42; +; }; +; void bitfieldinsert64(struct s64 *p, unsigned short v) { +; p->b = v; +; } + +%struct.s64 = type { i64 } + +define void @bitfieldinsert64(%struct.s64* nocapture %p, i16 zeroext %v) { +; CHECK-LABEL: @bitfieldinsert64 +; CHECK: ld [[REG1:[0-9]+]], 0(3) +; CHECK-NEXT: rlwimi [[REG1]], 4, 5, 11, 26 +; CHECK-NEXT: std [[REG1]], 0(3) +; CHECK-NEXT: blr +entry: + %0 = getelementptr inbounds %struct.s64, %struct.s64* %p, i64 0, i32 0 + %1 = zext i16 %v to i64 + %bf.load = load i64, i64* %0, align 8 + %bf.shl = shl nuw nsw i64 %1, 5 + %bf.clear = and i64 %bf.load, -2097121 + %bf.set = or i64 %bf.clear, %bf.shl + store i64 %bf.set, i64* %0, align 8 + ret void +} + ; bitfieldinsert32: Test for rlwimi ; equivalent C code ; struct s32 { @@ -17,9 +46,9 @@ define void @bitfieldinsert32(%struct.s32* nocapture %p, i32 zeroext %v) { ; CHECK-LABEL: @bitfieldinsert32 ; CHECK: lwz [[REG1:[0-9]+]], 0(3) -; CHECK: rlwimi [[REG1]], 4, 8, 8, 23 -; CHECK: stw [[REG1]], 0(3) -; CHECK: blr +; CHECK-NEXT: rlwimi [[REG1]], 4, 8, 8, 23 +; CHECK-NEXT: stw [[REG1]], 0(3) +; CHECK-NEXT: blr entry: %0 = getelementptr inbounds %struct.s32, %struct.s32* %p, i64 0, i32 0 
%bf.load = load i32, i32* %0, align 4 diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll index 91119786b1f..a35250526c7 100644 --- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll +++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll @@ -236,14 +236,12 @@ entry: ; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1) ; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1) ; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1) -; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1) +; CHECK-DAG: lwz 9, [[OFF0]](1) ; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1) -; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1) +; CHECK-DAG: lwz 10, [[OFF2]](1) ; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1) -; CHECK-DAG: sldi [[REG1]], [[REG1]], 32 -; CHECK-DAG: sldi [[REG3]], [[REG3]], 32 -; CHECK-DAG: or 9, [[REG0]], [[REG1]] -; CHECK-DAG: or 10, [[REG2]], [[REG3]] +; CHECK-DAG: rldimi 9, [[REG1]], 32, 0 +; CHECK-DAG: rldimi 10, [[REG3]], 32, 0 ; CHECK: bl test1 declare void @test1([8 x float], [8 x float]) diff --git a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll index 0d7501afc27..6e2802f6ff9 100644 --- a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll +++ b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll @@ -39,7 +39,7 @@ next: ret i32 %conv174 ; CHECK-LABEL: @test2 -; CHECK: slwi 3, {{[0-9]+}}, 7 +; CHECK: rlwinm 3, {{[0-9]+}}, 7, 17, 24 ; CHECK: rlwimi 3, {{[0-9]+}}, 15, 16, 16 ; CHECK: blr } -- GitLab From c4e53cf2f6736bb5ff869810f26b30daacb5f78c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 14:18:47 +0000 Subject: [PATCH 0101/1116] [X86][SSE] LowerVectorCTPOP - pull out repeated byte sum stage. Pull out repeated byte sum stage for popcount of vector elements > 8bits. This allows us to simplify the LUT/BITMATH popcnt code to always assume vXi8 vectors, and also improves avx512bitalg codegen which only has access to vpopcntb/vpopcntw. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344348 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 81 ++++------ test/CodeGen/X86/vec_ctbits.ll | 84 +++++----- test/CodeGen/X86/vector-lzcnt-128.ll | 144 ++++++++--------- test/CodeGen/X86/vector-popcnt-128.ll | 112 ++++++------- test/CodeGen/X86/vector-popcnt-256.ll | 38 +---- test/CodeGen/X86/vector-popcnt-512.ll | 18 +-- test/CodeGen/X86/vector-tzcnt-128.ll | 220 ++++++++++---------------- test/CodeGen/X86/vector-tzcnt-256.ll | 72 +-------- test/CodeGen/X86/vector-tzcnt-512.ll | 36 +---- 9 files changed, 292 insertions(+), 513 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 15bd238833d..d2971d0f861 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25023,7 +25023,8 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - unsigned VecSize = VT.getSizeInBits(); + int NumElts = VT.getVectorNumElements(); + assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported."); // Implement a lookup table in register by using an algorithm based on: // http://wm.ite.pl/articles/sse-popcount.html @@ -25035,56 +25036,37 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. - // - // To obtain the pop count for elements != i8, we follow up with the same - // approach and use additional tricks as described below. 
- // const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; - int NumByteElts = VecSize / 8; - MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); - SDValue In = DAG.getBitcast(ByteVecVT, Op); SmallVector LUTVec; - for (int i = 0; i < NumByteElts; ++i) + for (int i = 0; i < NumElts; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); - SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec); - SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT); + SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec); + SDValue M0F = DAG.getConstant(0x0F, DL, VT); // High nibbles - SDValue FourV = DAG.getConstant(4, DL, ByteVecVT); - SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); + SDValue FourV = DAG.getConstant(4, DL, VT); + SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV); // Low nibbles - SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); + SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F); // The input vector is used as the shuffle mask that index elements into the // LUT. After counting low and high nibbles, add the vector to obtain the // final pop count per i8 element. 
- SDValue HighPopCnt = - DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); - SDValue LowPopCnt = - DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); - SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); - - if (EltVT == MVT::i8) - return PopCnt; - - return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG); + SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles); + SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles); + return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt); } static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - assert(VT.is128BitVector() && - "Only 128-bit vector bitmath lowering supported."); - - int VecSize = VT.getSizeInBits(); - MVT EltVT = VT.getVectorElementType(); - int Len = EltVT.getSizeInBits(); + assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported."); // This is the vectorized version of the "best" algorithm from // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel @@ -25108,36 +25090,27 @@ static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, // x86, so set the SRL type to have elements at least i16 wide. This is // correct because all of our SRLs are followed immediately by a mask anyways // that handles any bits that sneak into the high bits of the byte elements. - MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16); - + MVT SrlVT = MVT::v8i16; SDValue V = Op; // v = v - ((v >> 1) & 0x55555555...) SDValue Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); - SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55))); + SDValue And = GetMask(Srl, APInt(8, 0x55)); V = DAG.getNode(ISD::SUB, DL, VT, V, And); // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) 
- SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33))); + SDValue AndLHS = GetMask(V, APInt(8, 0x33)); Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); - SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33))); + SDValue AndRHS = GetMask(Srl, APInt(8, 0x33)); V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); // v = (v + (v >> 4)) & 0x0F0F0F0F... Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); - V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F))); + V = GetMask(Add, APInt(8, 0x0F)); - // At this point, V contains the byte-wise population count, and we are - // merely doing a horizontal sum if necessary to get the wider element - // counts. - if (EltVT == MVT::i8) - return V; - - return LowerHorizontalByteSum( - DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget, - DAG); + return V; } // Please ensure that any codegen change from LowerVectorCTPOP is reflected in @@ -25163,12 +25136,6 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, } } - if (!Subtarget.hasSSSE3()) { - // We can't use the fast LUT approach, so fall back on vectorized bitmath. - assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); - return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); - } - // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); @@ -25177,6 +25144,18 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); + // For element types greater than i8, do vXi8 pop counts and a bytesum. 
+ if (VT.getScalarType() != MVT::i8) { + MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); + SDValue ByteOp = DAG.getBitcast(ByteVT, Op0); + SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp); + return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG); + } + + // We can't use the fast LUT approach, so fall back on vectorized bitmath. + if (!Subtarget.hasSSSE3()) + return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); + return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll index 781c61b5789..978a40cbb26 100644 --- a/test/CodeGen/X86/vec_ctbits.ll +++ b/test/CodeGen/X86/vec_ctbits.ll @@ -15,18 +15,18 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind { ; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 ; CHECK-NEXT: paddq %xmm2, %xmm3 ; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubq %xmm0, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubb %xmm0, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm3, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlq $2, %xmm3 +; CHECK-NEXT: psrlw $2, %xmm3 ; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 +; CHECK-NEXT: paddb %xmm2, %xmm3 ; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlq $4, %xmm0 -; CHECK-NEXT: paddq %xmm3, %xmm0 +; CHECK-NEXT: psrlw $4, %xmm0 +; CHECK-NEXT: paddb %xmm3, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -58,18 +58,18 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind { ; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubq %xmm0, %xmm1 
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubb %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlq $2, %xmm1 +; CHECK-NEXT: psrlw $2, %xmm1 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: paddq %xmm2, %xmm1 +; CHECK-NEXT: paddb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: psrlq $4, %xmm2 -; CHECK-NEXT: paddq %xmm1, %xmm2 +; CHECK-NEXT: psrlw $4, %xmm2 +; CHECK-NEXT: paddb %xmm1, %xmm2 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: psadbw %xmm2, %xmm0 @@ -83,18 +83,18 @@ define <2 x i64> @foopop(<2 x i64> %a) nounwind { ; CHECK-LABEL: foopop: ; CHECK: # %bb.0: ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $1, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psubq %xmm1, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm1, %xmm2 -; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: psrlw $2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: paddq %xmm2, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $4, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm0, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: psadbw %xmm0, %xmm1 @@ -119,18 +119,18 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind { ; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 ; CHECK-NEXT: paddq %xmm2, %xmm3 ; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubq %xmm0, %xmm3 -; 
CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubb %xmm0, %xmm3 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm3, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlq $2, %xmm3 +; CHECK-NEXT: psrlw $2, %xmm3 ; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 +; CHECK-NEXT: paddb %xmm2, %xmm3 ; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlq $4, %xmm0 -; CHECK-NEXT: paddq %xmm3, %xmm0 +; CHECK-NEXT: psrlw $4, %xmm0 +; CHECK-NEXT: paddb %xmm3, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -164,18 +164,18 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind { ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $1, %xmm0 +; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubq %xmm0, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubb %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm2, %xmm3 ; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: psrlq $2, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: paddq %xmm3, %xmm2 +; CHECK-NEXT: paddb %xmm3, %xmm2 ; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlq $4, %xmm0 -; CHECK-NEXT: paddq %xmm2, %xmm0 +; CHECK-NEXT: psrlw $4, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: psadbw %xmm1, %xmm0 ; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 @@ -191,18 +191,18 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind { ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $1, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 -; CHECK-NEXT: psubq 
%xmm1, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: psrlq $2, %xmm0 +; CHECK-NEXT: psrlw $2, %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 -; CHECK-NEXT: paddq %xmm3, %xmm0 +; CHECK-NEXT: paddb %xmm3, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $4, %xmm1 -; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm0, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psadbw %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll index dc945c84b19..34ea33d576c 100644 --- a/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/test/CodeGen/X86/vector-lzcnt-128.ll @@ -37,18 +37,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlq $4, %xmm2 -; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm2, %xmm0 @@ -77,18 +77,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-NEXT: 
pcmpeqd %xmm1, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlq $4, %xmm2 -; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm2, %xmm0 @@ -303,18 +303,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm1 +; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrlq $4, %xmm2 -; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm2, %xmm0 @@ -343,18 +343,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; 
SSE3-NEXT: pxor %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm1 +; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddq %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 -; SSE3-NEXT: psrlq $4, %xmm2 -; SSE3-NEXT: paddq %xmm1, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm2, %xmm0 @@ -566,18 +566,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -608,18 +608,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE3-NEXT: pxor %xmm1, 
%xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrld $2, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddd %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 @@ -808,18 +808,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrld $2, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -850,18 +850,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE3-NEXT: pxor %xmm1, %xmm2 ; SSE3-NEXT: movdqa 
%xmm2, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrld $2, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddd %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 @@ -1049,16 +1049,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -1085,16 +1085,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubw %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; 
SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 -; SSE3-NEXT: paddw %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 @@ -1255,16 +1255,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 -; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -1291,16 +1291,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubw %xmm0, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm0, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: paddw %xmm2, %xmm1 +; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw 
$4, %xmm2 -; SSE3-NEXT: paddw %xmm1, %xmm2 +; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll index df42ebf2728..16539f1b2d4 100644 --- a/test/CodeGen/X86/vector-popcnt-128.ll +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -14,18 +14,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubq %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $4, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm0, %xmm1 @@ -35,18 +35,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-LABEL: testv2i64: ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlq $1, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubq %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; 
SSE3-NEXT: paddq %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlq $4, %xmm1 -; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm0, %xmm1 @@ -128,28 +128,16 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq @@ -161,18 +149,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-LABEL: testv4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrlw 
$1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubd %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $4, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -187,18 +175,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE3-LABEL: testv4i32: ; SSE3: # %bb.0: ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrld $1, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubd %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 -; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrld $4, %xmm1 -; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: movdqa %xmm1, %xmm2 @@ -303,32 +291,20 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, 
%xmm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 @@ -346,16 +322,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: 
psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -368,16 +344,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubw %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll index b2cc2f1ebed..570f59673d1 100644 --- a/test/CodeGen/X86/vector-popcnt-256.ll +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -58,28 +58,15 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: 
vpshufb %ymm0, %ymm3, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: retq @@ -151,14 +138,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 @@ 
-169,14 +150,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; BITALG-LABEL: testv8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll index df5edc13c3e..eae9e6c79bd 100644 --- a/test/CodeGen/X86/vector-popcnt-512.ll +++ b/test/CodeGen/X86/vector-popcnt-512.ll @@ -50,14 +50,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; ; BITALG-LABEL: testv8i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: retq @@ 
-122,14 +115,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; BITALG-LABEL: testv16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index d19c10d68bc..a532794f89d 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -25,18 +25,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddq %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 +; 
SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -50,18 +50,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddq %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlq $4, %xmm0 -; SSE3-NEXT: paddq %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -155,15 +155,9 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; 
BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64: @@ -173,14 +167,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq ; @@ -217,18 +204,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddq %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubq %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlq $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlq $4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -242,18 +229,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddq %xmm2, %xmm3 ; SSE3-NEXT: movdqa 
%xmm3, %xmm0 -; SSE3-NEXT: psrlq $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubq %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlq $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlq $4, %xmm0 -; SSE3-NEXT: paddq %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psadbw %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -386,15 +373,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64u: @@ -404,14 +385,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand 
%xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq ; @@ -448,18 +422,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrld $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -478,18 +452,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddd %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE3-NEXT: pand %xmm0, 
%xmm2 -; SSE3-NEXT: psrld $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddd %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -667,19 +641,13 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i32: @@ -689,14 +657,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; 
BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -742,18 +703,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: psubb %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrld $2, %xmm3 +; SSE2-NEXT: psrlw $2, %xmm3 ; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrld $4, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -772,18 +733,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE3-NEXT: paddd %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $1, %xmm0 +; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubd %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: psubb %xmm0, %xmm3 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm3, 
%xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrld $2, %xmm3 +; SSE3-NEXT: psrlw $2, %xmm3 ; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddd %xmm2, %xmm3 +; SSE3-NEXT: paddb %xmm2, %xmm3 ; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrld $4, %xmm0 -; SSE3-NEXT: paddd %xmm3, %xmm0 +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -938,19 +899,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i32u: @@ -960,14 +915,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3 -; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1014,16 +962,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -1041,16 +989,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubw %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddw 
%xmm0, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 @@ -1210,16 +1158,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psubw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 @@ -1237,16 +1185,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: psubw %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index b1173fa4b88..cae0a2d605a 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ 
b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -124,14 +124,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -142,14 +135,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: retq ; @@ -270,14 +256,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -288,14 +267,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: retq ; @@ -452,14 +424,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, 
%ymm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -474,14 +439,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -623,14 +581,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -645,14 +596,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3 -; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 37c86f7f81a..4a9fd82593a 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -87,14 +87,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, 
%zmm2 ; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0) @@ -157,14 +150,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1) @@ -269,14 +255,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; 
BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] @@ -347,14 +326,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 ; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3 -; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0 -; BITALG-NEXT: 
vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -- GitLab From 08bc40e744f3f44228a9ce57778b5acff1f7fa77 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 14:45:57 +0000 Subject: [PATCH 0102/1116] [SelectionDAG] Move VectorLegalizer::ExpandCTLZ codegen into SelectionDAGLegalize Generalize SelectionDAGLegalize's CTLZ expansion to handle vectors - lets VectorLegalizer::ExpandCTLZ to just pass the expansion on instead of repeating the same codegen. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344349 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +- .../SelectionDAG/LegalizeVectorOps.cpp | 27 ++------- test/CodeGen/X86/vec_ctbits.ll | 58 +++++++++---------- 3 files changed, 34 insertions(+), 53 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 27875c11909..a6c0610f963 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2761,7 +2761,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op); case ISD::CTLZ: { EVT VT = Op.getValueType(); - unsigned Len = VT.getSizeInBits(); + unsigned Len = VT.getScalarSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp 
index 3f38ed8a03c..852415647b1 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1082,32 +1082,13 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0)); } - // If CTPOP is available we can lower with a CTPOP based method: - // u16 ctlz(u16 x) { - // x |= (x >> 1); - // x |= (x >> 2); - // x |= (x >> 4); - // x |= (x >> 8); - // return ctpop(~x); - // } - // Ref: "Hacker's Delight" by Henry Warren + // If we have the appropriate vector bit operations, it is better to use them + // than unrolling and expanding each component. if (isPowerOf2_32(NumBitsPerElt) && TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) && TLI.isOperationLegalOrCustom(ISD::SRL, VT) && - TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) && - TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT)) { - SDLoc DL(Op); - SDValue Res = Op.getOperand(0); - EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - - for (unsigned i = 1; i != NumBitsPerElt; i *= 2) - Res = DAG.getNode( - ISD::OR, DL, VT, Res, - DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(i, DL, ShiftTy))); - - Res = DAG.getNOT(DL, Res, VT); - return DAG.getNode(ISD::CTPOP, DL, VT, Res); - } + TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT)) + return Op; // Otherwise go ahead and unroll. 
return DAG.UnrollVectorOp(Op.getNode()); diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll index 978a40cbb26..40e101756ef 100644 --- a/test/CodeGen/X86/vec_ctbits.ll +++ b/test/CodeGen/X86/vec_ctbits.ll @@ -142,42 +142,42 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promlz: ; CHECK: # %bb.0: ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $1, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $1, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: psrlq $2, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $4, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $4, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: psrlq $8, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm2 -; CHECK-NEXT: psrlq $16, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlq $16, %xmm1 +; CHECK-NEXT: por %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: psrlq $32, %xmm0 -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm2 +; CHECK-NEXT: psubb %xmm0, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm0, %xmm3 
-; CHECK-NEXT: psrlw $2, %xmm2 +; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: paddb %xmm3, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm0 -; CHECK-NEXT: psrlw $4, %xmm0 -; CHECK-NEXT: paddb %xmm2, %xmm0 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: psrlw $2, %xmm1 +; CHECK-NEXT: pand %xmm0, %xmm1 +; CHECK-NEXT: paddb %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrlw $4, %xmm2 +; CHECK-NEXT: paddb %xmm1, %xmm2 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: psadbw %xmm2, %xmm0 ; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) -- GitLab From 1cea19bc9824f1c266aacca66425f84ffd743dfb Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Fri, 12 Oct 2018 15:01:11 +0000 Subject: [PATCH 0103/1116] Fix unused variable warning after r344348 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344350 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d2971d0f861..872d90ad004 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -25024,6 +25024,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); int NumElts = VT.getVectorNumElements(); + (void)EltVT; assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported."); // Implement a lookup table in register by using an algorithm based on: -- GitLab From 05638b189e1bb92871b498484b3b30666ef6e82f Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Fri, 12 Oct 2018 15:12:22 +0000 Subject: [PATCH 0104/1116] [llvm-exegesis][NFC] Simplify code at the cost of small code duplication Reviewers: courbet Subscribers: tschuett, llvm-commits Differential 
Revision: https://reviews.llvm.org/D53198 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344351 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/X86/Target.cpp | 118 +++++++++++++------------ 1 file changed, 60 insertions(+), 58 deletions(-) diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index 440996ad555..0e9a6de95ce 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -21,81 +21,83 @@ namespace exegesis { namespace { -// Common code for X86 Uops and Latency runners. -template class X86SnippetGenerator : public Impl { - using Impl::Impl; +static llvm::Error IsInvalidOpcode(const Instruction &Instr) { + const auto OpcodeName = Instr.Name; + if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") || + OpcodeName.startswith("ADJCALLSTACK")) + return llvm::make_error( + "Unsupported opcode: Push/Pop/AdjCallStack"); + return llvm::Error::success(); +} + +static unsigned GetX86FPFlags(const Instruction &Instr) { + return Instr.Description->TSFlags & llvm::X86II::FPTypeMask; +} + +class X86LatencySnippetGenerator : public LatencySnippetGenerator { +public: + using LatencySnippetGenerator::LatencySnippetGenerator; llvm::Expected generateCodeTemplate(const Instruction &Instr) const override { - // Test whether we can generate a snippet for this instruction. - const auto OpcodeName = Instr.Name; - if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") || - OpcodeName.startswith("ADJCALLSTACK")) { - return llvm::make_error( - "Unsupported opcode: Push/Pop/AdjCallStack"); - } + if (auto E = IsInvalidOpcode(Instr)) + return std::move(E); - // Handle X87. 
- const unsigned FPInstClass = - Instr.Description->TSFlags & llvm::X86II::FPTypeMask; - switch (FPInstClass) { + switch (GetX86FPFlags(Instr)) { case llvm::X86II::NotFP: - break; + return LatencySnippetGenerator::generateCodeTemplate(Instr); case llvm::X86II::ZeroArgFP: - return llvm::make_error("Unsupported x87 ZeroArgFP"); case llvm::X86II::OneArgFP: - return llvm::make_error("Unsupported x87 OneArgFP"); + case llvm::X86II::SpecialFP: + case llvm::X86II::CompareFP: + case llvm::X86II::CondMovFP: + return llvm::make_error("Unsupported x87 Instruction"); case llvm::X86II::OneArgFPRW: - case llvm::X86II::TwoArgFP: { + case llvm::X86II::TwoArgFP: // These are instructions like // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) // They are intrinsically serial and do not modify the state of the stack. - // We generate the same code for latency and uops. - return this->generateSelfAliasingCodeTemplate(Instr); - } - case llvm::X86II::CompareFP: - return Impl::handleCompareFP(Instr); - case llvm::X86II::CondMovFP: - return Impl::handleCondMovFP(Instr); - case llvm::X86II::SpecialFP: - return llvm::make_error("Unsupported x87 SpecialFP"); + return generateSelfAliasingCodeTemplate(Instr); default: llvm_unreachable("Unknown FP Type!"); } - - // Fallback to generic implementation. 
- return Impl::Base::generateCodeTemplate(Instr); } }; -class X86LatencyImpl : public LatencySnippetGenerator { -protected: - using Base = LatencySnippetGenerator; - using Base::Base; - llvm::Expected handleCompareFP(const Instruction &Instr) const { - return llvm::make_error( - "Unsupported x87 CompareFP"); - } - llvm::Expected handleCondMovFP(const Instruction &Instr) const { - return llvm::make_error( - "Unsupported x87 CondMovFP"); - } -}; +class X86UopsSnippetGenerator : public UopsSnippetGenerator { +public: + using UopsSnippetGenerator::UopsSnippetGenerator; -class X86UopsImpl : public UopsSnippetGenerator { -protected: - using Base = UopsSnippetGenerator; - using Base::Base; - // We can compute uops for any FP instruction that does not grow or shrink the - // stack (either do not touch the stack or push as much as they pop). - llvm::Expected handleCompareFP(const Instruction &Instr) const { - return generateUnconstrainedCodeTemplate( - Instr, "instruction does not grow/shrink the FP stack"); - } - llvm::Expected handleCondMovFP(const Instruction &Instr) const { - return generateUnconstrainedCodeTemplate( - Instr, "instruction does not grow/shrink the FP stack"); + llvm::Expected + generateCodeTemplate(const Instruction &Instr) const override { + if (auto E = IsInvalidOpcode(Instr)) + return std::move(E); + + switch (GetX86FPFlags(Instr)) { + case llvm::X86II::NotFP: + return UopsSnippetGenerator::generateCodeTemplate(Instr); + case llvm::X86II::ZeroArgFP: + case llvm::X86II::OneArgFP: + case llvm::X86II::SpecialFP: + return llvm::make_error("Unsupported x87 Instruction"); + case llvm::X86II::OneArgFPRW: + case llvm::X86II::TwoArgFP: + // These are instructions like + // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) + // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) + // They are intrinsically serial and do not modify the state of the stack. + // We generate the same code for latency and uops. 
+ return generateSelfAliasingCodeTemplate(Instr); + case llvm::X86II::CompareFP: + case llvm::X86II::CondMovFP: + // We can compute uops for any FP instruction that does not grow or shrink + // the stack (either do not touch the stack or push as much as they pop). + return generateUnconstrainedCodeTemplate( + Instr, "instruction does not grow/shrink the FP stack"); + default: + llvm_unreachable("Unknown FP Type!"); + } } }; @@ -330,12 +332,12 @@ class ExegesisX86Target : public ExegesisTarget { std::unique_ptr createLatencySnippetGenerator(const LLVMState &State) const override { - return llvm::make_unique>(State); + return llvm::make_unique(State); } std::unique_ptr createUopsSnippetGenerator(const LLVMState &State) const override { - return llvm::make_unique>(State); + return llvm::make_unique(State); } bool matchesArch(llvm::Triple::ArchType Arch) const override { -- GitLab From a6b5202ae09893e943f38b5675f7292c99828b2a Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 12 Oct 2018 15:22:14 +0000 Subject: [PATCH 0105/1116] [AArch64][x86] add tests for trunc disguised as vector ops (PR39016); NFC These correspond to the IR transform from: D52439 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344353 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AArch64/extract-insert.ll | 118 +++++++++++++++++++++++++ test/CodeGen/X86/extract-insert.ll | 55 +++++++++++- 2 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/AArch64/extract-insert.ll diff --git a/test/CodeGen/AArch64/extract-insert.ll b/test/CodeGen/AArch64/extract-insert.ll new file mode 100644 index 00000000000..91f6518edd8 --- /dev/null +++ b/test/CodeGen/AArch64/extract-insert.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64_be-- < %s | FileCheck %s --check-prefix=BE +; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s --check-prefix=LE + +define i32 @trunc_i64_to_i32_le(i64 %x) { +; 
BE-LABEL: trunc_i64_to_i32_le: +; BE: // %bb.0: +; BE-NEXT: fmov d0, x0 +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: fmov w0, s0 +; BE-NEXT: ret +; +; LE-LABEL: trunc_i64_to_i32_le: +; LE: // %bb.0: +; LE-NEXT: fmov d0, x0 +; LE-NEXT: fmov w0, s0 +; LE-NEXT: ret + %ins = insertelement <2 x i64> undef, i64 %x, i32 0 + %bc = bitcast <2 x i64> %ins to <4 x i32> + %ext = extractelement <4 x i32> %bc, i32 0 + ret i32 %ext +} + +define i32 @trunc_i64_to_i32_be(i64 %x) { +; BE-LABEL: trunc_i64_to_i32_be: +; BE: // %bb.0: +; BE-NEXT: fmov d0, x0 +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: mov w0, v0.s[1] +; BE-NEXT: ret +; +; LE-LABEL: trunc_i64_to_i32_be: +; LE: // %bb.0: +; LE-NEXT: fmov d0, x0 +; LE-NEXT: mov w0, v0.s[1] +; LE-NEXT: ret + %ins = insertelement <2 x i64> undef, i64 %x, i32 0 + %bc = bitcast <2 x i64> %ins to <4 x i32> + %ext = extractelement <4 x i32> %bc, i32 1 + ret i32 %ext +} + +define i16 @trunc_i64_to_i16_le(i64 %x) { +; BE-LABEL: trunc_i64_to_i16_le: +; BE: // %bb.0: +; BE-NEXT: fmov d0, x0 +; BE-NEXT: rev64 v0.8h, v0.8h +; BE-NEXT: umov w0, v0.h[0] +; BE-NEXT: ret +; +; LE-LABEL: trunc_i64_to_i16_le: +; LE: // %bb.0: +; LE-NEXT: fmov d0, x0 +; LE-NEXT: umov w0, v0.h[0] +; LE-NEXT: ret + %ins = insertelement <2 x i64> undef, i64 %x, i32 0 + %bc = bitcast <2 x i64> %ins to <8 x i16> + %ext = extractelement <8 x i16> %bc, i32 0 + ret i16 %ext +} + +define i16 @trunc_i64_to_i16_be(i64 %x) { +; BE-LABEL: trunc_i64_to_i16_be: +; BE: // %bb.0: +; BE-NEXT: fmov d0, x0 +; BE-NEXT: rev64 v0.8h, v0.8h +; BE-NEXT: umov w0, v0.h[3] +; BE-NEXT: ret +; +; LE-LABEL: trunc_i64_to_i16_be: +; LE: // %bb.0: +; LE-NEXT: fmov d0, x0 +; LE-NEXT: umov w0, v0.h[3] +; LE-NEXT: ret + %ins = insertelement <2 x i64> undef, i64 %x, i32 0 + %bc = bitcast <2 x i64> %ins to <8 x i16> + %ext = extractelement <8 x i16> %bc, i32 3 + ret i16 %ext +} + +define i8 @trunc_i32_to_i8_le(i32 %x) { +; BE-LABEL: trunc_i32_to_i8_le: +; BE: // %bb.0: +; BE-NEXT: fmov s0, w0 +; BE-NEXT: rev32 
v0.16b, v0.16b +; BE-NEXT: umov w0, v0.b[0] +; BE-NEXT: ret +; +; LE-LABEL: trunc_i32_to_i8_le: +; LE: // %bb.0: +; LE-NEXT: fmov s0, w0 +; LE-NEXT: umov w0, v0.b[0] +; LE-NEXT: ret + %ins = insertelement <4 x i32> undef, i32 %x, i32 0 + %bc = bitcast <4 x i32> %ins to <16 x i8> + %ext = extractelement <16 x i8> %bc, i32 0 + ret i8 %ext +} + +define i8 @trunc_i32_to_i8_be(i32 %x) { +; BE-LABEL: trunc_i32_to_i8_be: +; BE: // %bb.0: +; BE-NEXT: fmov s0, w0 +; BE-NEXT: rev32 v0.16b, v0.16b +; BE-NEXT: umov w0, v0.b[3] +; BE-NEXT: ret +; +; LE-LABEL: trunc_i32_to_i8_be: +; LE: // %bb.0: +; LE-NEXT: fmov s0, w0 +; LE-NEXT: umov w0, v0.b[3] +; LE-NEXT: ret + %ins = insertelement <4 x i32> undef, i32 %x, i32 0 + %bc = bitcast <4 x i32> %ins to <16 x i8> + %ext = extractelement <16 x i8> %bc, i32 3 + ret i8 %ext +} + diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll index b3fb50de718..2393e32ebf6 100644 --- a/test/CodeGen/X86/extract-insert.ll +++ b/test/CodeGen/X86/extract-insert.ll @@ -29,7 +29,7 @@ define i8 @extractelt_bitcast(i32 %x) nounwind { } ; TODO: This should have folded to avoid vector ops, but the transform -; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU +; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU ; codegen better. 
define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind { @@ -60,3 +60,56 @@ define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind { ret i8 %ext } +define i32 @trunc_i64_to_i32_le(i64 %x) { +; X86-LABEL: trunc_i64_to_i32_le: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: trunc_i64_to_i32_le: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: retq + %ins = insertelement <2 x i64> undef, i64 %x, i32 0 + %bc = bitcast <2 x i64> %ins to <4 x i32> + %ext = extractelement <4 x i32> %bc, i32 0 + ret i32 %ext +} + +define i16 @trunc_i64_to_i16_le(i64 %x) { +; X86-LABEL: trunc_i64_to_i16_le: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: trunc_i64_to_i16_le: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %ins = insertelement <2 x i64> undef, i64 %x, i32 0 + %bc = bitcast <2 x i64> %ins to <8 x i16> + %ext = extractelement <8 x i16> %bc, i32 0 + ret i16 %ext +} + +define i8 @trunc_i32_to_i8_le(i32 %x) { +; X86-LABEL: trunc_i32_to_i8_le: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: retl +; +; X64-LABEL: trunc_i32_to_i8_le: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %ins = insertelement <4 x i32> undef, i32 %x, i32 0 + %bc = bitcast <4 x i32> %ins to <16 x i8> + %ext = extractelement <16 x i8> %bc, i32 0 + ret i8 %ext +} + -- GitLab From 150e4ae6c16145b2c7da7f09fe253db8dbe5eb44 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 15:48:47 +0000 Subject: [PATCH 0106/1116] Pull out repeated value types. NFCI. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344354 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 852415647b1..8cc37b5f233 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1079,7 +1079,7 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { if (Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF && TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) { SDLoc DL(Op); - return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0)); + return DAG.getNode(ISD::CTLZ, DL, VT, Op.getOperand(0)); } // If we have the appropriate vector bit operations, it is better to use them @@ -1095,10 +1095,12 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { } SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) { + EVT VT = Op.getValueType(); + // If the non-ZERO_UNDEF version is supported we can use that instead. - if (TLI.isOperationLegalOrCustom(ISD::CTTZ, Op.getValueType())) { + if (TLI.isOperationLegalOrCustom(ISD::CTTZ, VT)) { SDLoc DL(Op); - return DAG.getNode(ISD::CTTZ, DL, Op.getValueType(), Op.getOperand(0)); + return DAG.getNode(ISD::CTTZ, DL, VT, Op.getOperand(0)); } // Otherwise go ahead and unroll. -- GitLab From 7dac907c9c291ea9e3fb1ae1ca0e8223894485f4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 15:49:19 +0000 Subject: [PATCH 0107/1116] Pull out repeated value types. NFCI. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344355 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index a6c0610f963..56025110f0a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2708,10 +2708,11 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) { /// Expand the specified bitcount instruction into operations. SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl) { + EVT VT = Op.getValueType(); + switch (Opc) { default: llvm_unreachable("Cannot expand this yet!"); case ISD::CTPOP: { - EVT VT = Op.getValueType(); EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); unsigned Len = VT.getSizeInBits(); @@ -2758,9 +2759,8 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, } case ISD::CTLZ_ZERO_UNDEF: // This trivially expands to CTLZ. - return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op); + return DAG.getNode(ISD::CTLZ, dl, VT, Op); case ISD::CTLZ: { - EVT VT = Op.getValueType(); unsigned Len = VT.getScalarSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { @@ -2792,9 +2792,8 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, } case ISD::CTTZ_ZERO_UNDEF: // This trivially expands to CTTZ. 
- return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op); + return DAG.getNode(ISD::CTTZ, dl, VT, Op); case ISD::CTTZ: { - EVT VT = Op.getValueType(); unsigned Len = VT.getSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { @@ -2818,7 +2817,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) && TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) return DAG.getNode(ISD::SUB, dl, VT, - DAG.getConstant(VT.getSizeInBits(), dl, VT), + DAG.getConstant(Len, dl, VT), DAG.getNode(ISD::CTLZ, dl, VT, Tmp3)); return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3); } -- GitLab From 185de913d577af58e9ec8264b549a0bec097808f Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Fri, 12 Oct 2018 16:24:09 +0000 Subject: [PATCH 0108/1116] Make YAML quote forward slashes. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344357 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/YAMLTraits.h | 7 ++++++- test/CodeGen/AArch64/arm64-spill-remarks.ll | 8 ++++---- test/ObjectYAML/MachO/DWARF-BigEndian.yaml | 4 ++-- test/ObjectYAML/MachO/DWARF-LittleEndian.yaml | 4 ++-- test/ObjectYAML/MachO/DWARF-debug_str.yaml | 2 +- test/ObjectYAML/MachO/dylib_dylinker_command.yaml | 4 ++-- test/Other/size-remarks.ll | 4 ++-- test/Transforms/GVN/opt-remarks.ll | 6 +++--- .../Transforms/Inline/optimization-remarks-passed-yaml.ll | 6 +++--- test/Transforms/Inline/optimization-remarks-yaml.ll | 8 ++++---- unittests/Support/YAMLIOTest.cpp | 4 +++- 11 files changed, 32 insertions(+), 25 deletions(-) diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 5d029ad5ce9..6219755e83a 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -578,7 +578,6 @@ inline QuotingType needsQuotes(StringRef S) { // Safe scalar characters. 
case '_': case '-': - case '/': case '^': case '.': case ',': @@ -595,6 +594,12 @@ inline QuotingType needsQuotes(StringRef S) { // DEL (0x7F) are excluded from the allowed character range. case 0x7F: return QuotingType::Double; + // Forward slash is allowed to be unquoted, but we quote it anyway. We have + // many tests that use FileCheck against YAML output, and this output often + // contains paths. If we quote backslashes but not forward slashes then + // paths will come out either quoted or unquoted depending on which platform + // the test is run on, making FileCheck comparisons difficult. + case '/': default: { // C0 control block (0x0 - 0x1F) is excluded from the allowed character // range. diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll index 53a16ed748b..2d187a74445 100644 --- a/test/CodeGen/AArch64/arm64-spill-remarks.ll +++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll @@ -38,7 +38,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: /tmp/kk.c, Line: 3, Column: 20 } +; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 3, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 300 ; YAML: Args: @@ -51,7 +51,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } +; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 2, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 30000 ; YAML: Args: @@ -64,7 +64,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: /tmp/kk.c, Line: 1, Column: 20 } +; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 1, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 300 ; YAML: Args: @@ -79,7 +79,7 @@ ; THRESHOLD_YAML: --- !Missed ; THRESHOLD_YAML: Pass: regalloc ; THRESHOLD_YAML: Name: LoopSpillReload -; THRESHOLD_YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } +; THRESHOLD_YAML: DebugLoc: { File: 
'/tmp/kk.c', Line: 2, Column: 20 } ; THRESHOLD_YAML: Function: fpr128 ; THRESHOLD_YAML: Hotness: 30000 ; THRESHOLD_YAML: Args: diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml index adc95b95010..c6a45cd36ea 100644 --- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml +++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml @@ -376,8 +376,8 @@ DWARF: #CHECK: DWARF: #CHECK: debug_str: #CHECK: - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)' -#CHECK: - ../compiler-rt/lib/builtins/absvdi2.c -#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel +#CHECK: - '../compiler-rt/lib/builtins/absvdi2.c' +#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' #CHECK: - int #CHECK: - di_int #CHECK: - long long int diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml index 1d6da66a073..1e136e67be1 100644 --- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml +++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml @@ -365,8 +365,8 @@ DWARF: #CHECK: DWARF: #CHECK: debug_str: #CHECK: - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)' -#CHECK: - ../compiler-rt/lib/builtins/absvdi2.c -#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel +#CHECK: - '../compiler-rt/lib/builtins/absvdi2.c' +#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' #CHECK: - int #CHECK: - di_int #CHECK: - long long int diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml index 417a755642b..84c5e22d255 100644 --- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml +++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml @@ -257,7 +257,7 @@ DWARF: #CHECK: - '' #CHECK: - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)' #CHECK: - hello_world.c -#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel +#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' #CHECK: - main #CHECK: - argc #CHECK: - argv diff --git 
a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml index 9184e3c5143..5fc6afa536e 100644 --- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml +++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml @@ -40,7 +40,7 @@ LoadCommands: #CHECK: - cmd: LC_LOAD_DYLINKER #CHECK: cmdsize: 32 #CHECK: name: 12 -#CHECK: PayloadString: /usr/lib/dyld +#CHECK: PayloadString: '/usr/lib/dyld' #CHECK: ZeroPadBytes: 7 #CHECK: - cmd: LC_LOAD_DYLIB #CHECK: cmdsize: 48 @@ -58,5 +58,5 @@ LoadCommands: #CHECK: timestamp: 2 #CHECK: current_version: 80349697 #CHECK: compatibility_version: 65536 -#CHECK: PayloadString: /usr/lib/libSystem.B.dylib +#CHECK: PayloadString: '/usr/lib/libSystem.B.dylib' #CHECK: ZeroPadBytes: 6 diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll index 34cb1202bb9..1e96dd02207 100644 --- a/test/Other/size-remarks.ll +++ b/test/Other/size-remarks.ll @@ -32,7 +32,7 @@ ; CGSCC-NEXT: Name: IRSizeChange ; CGSCC-NEXT: Function: ; CGSCC-NEXT: Args: -; CGSCC-NEXT: - Pass: Function Integration/Inlining +; CGSCC-NEXT: - Pass: 'Function Integration/Inlining' ; CGSCC-NEXT: - String: ': IR instruction count changed from ' ; CGSCC-NEXT: - IRInstrsBefore: '[[ORIG]]' ; CGSCC-NEXT: - String: ' to ' @@ -44,7 +44,7 @@ ; CGSCC-NEXT: Name: FunctionIRSizeChange ; CGSCC-NEXT: Function: ; CGSCC-NEXT: Args: -; CGSCC-NEXT: - Pass: Function Integration/Inlining +; CGSCC-NEXT: - Pass: 'Function Integration/Inlining' ; CGSCC-NEXT: - String: ': Function: ' ; CGSCC-NEXT: - Function: bar ; CGSCC-NEXT: - String: ': IR instruction count changed from ' diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll index 6919528bb83..120ff36f204 100644 --- a/test/Transforms/GVN/opt-remarks.ll +++ b/test/Transforms/GVN/opt-remarks.ll @@ -49,7 +49,7 @@ ; YAML-NEXT: --- !Missed ; YAML-NEXT: Pass: gvn ; YAML-NEXT: Name: LoadClobbered -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 3, Column: 3 } +; YAML-NEXT: 
DebugLoc: { File: '/tmp/s.c', Line: 3, Column: 3 } ; YAML-NEXT: Function: may_alias ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'load of type ' @@ -57,10 +57,10 @@ ; YAML-NEXT: - String: ' not eliminated' ; YAML-NEXT: - String: ' in favor of ' ; YAML-NEXT: - OtherAccess: load -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 1, Column: 13 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 1, Column: 13 } ; YAML-NEXT: - String: ' because it is clobbered by ' ; YAML-NEXT: - ClobberedBy: store -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 2, Column: 10 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 2, Column: 10 } ; YAML-NEXT: ... define i32 @arg(i32* %p, i32 %i) { diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll index 0ac76354a2b..8692abfaf19 100644 --- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll +++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll @@ -22,15 +22,15 @@ ; YAML: --- !Passed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: Inlined -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 10 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 10 } ; YAML-NEXT: Function: bar ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: foo -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 1, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 1, Column: 0 } ; YAML-NEXT: - String: ' inlined into ' ; YAML-NEXT: - Caller: bar -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 3, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 3, Column: 0 } ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' ; YAML-NEXT: - Cost: '{{[0-9\-]+}}' diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll index cb366dbbdd3..10a93f5cd79 100644 --- a/test/Transforms/Inline/optimization-remarks-yaml.ll +++ b/test/Transforms/Inline/optimization-remarks-yaml.ll @@ 
-52,27 +52,27 @@ ; YAML: --- !Missed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: NoDefinition -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 5, Column: 10 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 5, Column: 10 } ; YAML-NEXT: Function: baz ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: foo ; YAML-NEXT: - String: ' will not be inlined into ' ; YAML-NEXT: - Caller: baz -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 0 } ; YAML-NEXT: - String: ' because its definition is unavailable' ; YAML-NEXT: ... ; YAML-NEXT: --- !Missed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: NoDefinition -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 5, Column: 18 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 5, Column: 18 } ; YAML-NEXT: Function: baz ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: bar ; YAML-NEXT: - String: ' will not be inlined into ' ; YAML-NEXT: - Caller: baz -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 0 } ; YAML-NEXT: - String: ' because its definition is unavailable' ; YAML-NEXT: ... diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index 4530482ec80..94e9874147f 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -2543,7 +2543,9 @@ TEST(YAMLIO, TestEscaped) { // Single quote TestEscaped("@abc@", "'@abc@'"); // No quote - TestEscaped("abc/", "abc/"); + TestEscaped("abc", "abc"); + // Forward slash quoted + TestEscaped("abc/", "'abc/'"); // Double quote non-printable TestEscaped("\01@abc@", "\"\\x01@abc@\""); // Double quote inside single quote -- GitLab From 7b3c18864147ef2d8995cbecd44bf6f7659406af Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Fri, 12 Oct 2018 16:31:08 +0000 Subject: [PATCH 0109/1116] Revert "Make YAML quote forward slashes." 
This reverts commit b86c16ad8c97dadc1f529da72a5bb74e9eaed344. This is being reverted because I forgot to write a useful commit message, so I'm going to resubmit it with an actual commit message. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344358 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/YAMLTraits.h | 7 +------ test/CodeGen/AArch64/arm64-spill-remarks.ll | 8 ++++---- test/ObjectYAML/MachO/DWARF-BigEndian.yaml | 4 ++-- test/ObjectYAML/MachO/DWARF-LittleEndian.yaml | 4 ++-- test/ObjectYAML/MachO/DWARF-debug_str.yaml | 2 +- test/ObjectYAML/MachO/dylib_dylinker_command.yaml | 4 ++-- test/Other/size-remarks.ll | 4 ++-- test/Transforms/GVN/opt-remarks.ll | 6 +++--- .../Transforms/Inline/optimization-remarks-passed-yaml.ll | 6 +++--- test/Transforms/Inline/optimization-remarks-yaml.ll | 8 ++++---- unittests/Support/YAMLIOTest.cpp | 4 +--- 11 files changed, 25 insertions(+), 32 deletions(-) diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 6219755e83a..5d029ad5ce9 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -578,6 +578,7 @@ inline QuotingType needsQuotes(StringRef S) { // Safe scalar characters. case '_': case '-': + case '/': case '^': case '.': case ',': @@ -594,12 +595,6 @@ inline QuotingType needsQuotes(StringRef S) { // DEL (0x7F) are excluded from the allowed character range. case 0x7F: return QuotingType::Double; - // Forward slash is allowed to be unquoted, but we quote it anyway. We have - // many tests that use FileCheck against YAML output, and this output often - // contains paths. If we quote backslashes but not forward slashes then - // paths will come out either quoted or unquoted depending on which platform - // the test is run on, making FileCheck comparisons difficult. - case '/': default: { // C0 control block (0x0 - 0x1F) is excluded from the allowed character // range. 
diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll index 2d187a74445..53a16ed748b 100644 --- a/test/CodeGen/AArch64/arm64-spill-remarks.ll +++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll @@ -38,7 +38,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 3, Column: 20 } +; YAML: DebugLoc: { File: /tmp/kk.c, Line: 3, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 300 ; YAML: Args: @@ -51,7 +51,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 2, Column: 20 } +; YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 30000 ; YAML: Args: @@ -64,7 +64,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 1, Column: 20 } +; YAML: DebugLoc: { File: /tmp/kk.c, Line: 1, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 300 ; YAML: Args: @@ -79,7 +79,7 @@ ; THRESHOLD_YAML: --- !Missed ; THRESHOLD_YAML: Pass: regalloc ; THRESHOLD_YAML: Name: LoopSpillReload -; THRESHOLD_YAML: DebugLoc: { File: '/tmp/kk.c', Line: 2, Column: 20 } +; THRESHOLD_YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } ; THRESHOLD_YAML: Function: fpr128 ; THRESHOLD_YAML: Hotness: 30000 ; THRESHOLD_YAML: Args: diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml index c6a45cd36ea..adc95b95010 100644 --- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml +++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml @@ -376,8 +376,8 @@ DWARF: #CHECK: DWARF: #CHECK: debug_str: #CHECK: - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)' -#CHECK: - '../compiler-rt/lib/builtins/absvdi2.c' -#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' +#CHECK: - ../compiler-rt/lib/builtins/absvdi2.c +#CHECK: - 
/Users/cbieneman/dev/open-source/llvm-build-rel #CHECK: - int #CHECK: - di_int #CHECK: - long long int diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml index 1e136e67be1..1d6da66a073 100644 --- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml +++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml @@ -365,8 +365,8 @@ DWARF: #CHECK: DWARF: #CHECK: debug_str: #CHECK: - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)' -#CHECK: - '../compiler-rt/lib/builtins/absvdi2.c' -#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' +#CHECK: - ../compiler-rt/lib/builtins/absvdi2.c +#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel #CHECK: - int #CHECK: - di_int #CHECK: - long long int diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml index 84c5e22d255..417a755642b 100644 --- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml +++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml @@ -257,7 +257,7 @@ DWARF: #CHECK: - '' #CHECK: - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)' #CHECK: - hello_world.c -#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' +#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel #CHECK: - main #CHECK: - argc #CHECK: - argv diff --git a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml index 5fc6afa536e..9184e3c5143 100644 --- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml +++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml @@ -40,7 +40,7 @@ LoadCommands: #CHECK: - cmd: LC_LOAD_DYLINKER #CHECK: cmdsize: 32 #CHECK: name: 12 -#CHECK: PayloadString: '/usr/lib/dyld' +#CHECK: PayloadString: /usr/lib/dyld #CHECK: ZeroPadBytes: 7 #CHECK: - cmd: LC_LOAD_DYLIB #CHECK: cmdsize: 48 @@ -58,5 +58,5 @@ LoadCommands: #CHECK: timestamp: 2 #CHECK: current_version: 80349697 #CHECK: compatibility_version: 65536 -#CHECK: PayloadString: '/usr/lib/libSystem.B.dylib' +#CHECK: 
PayloadString: /usr/lib/libSystem.B.dylib #CHECK: ZeroPadBytes: 6 diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll index 1e96dd02207..34cb1202bb9 100644 --- a/test/Other/size-remarks.ll +++ b/test/Other/size-remarks.ll @@ -32,7 +32,7 @@ ; CGSCC-NEXT: Name: IRSizeChange ; CGSCC-NEXT: Function: ; CGSCC-NEXT: Args: -; CGSCC-NEXT: - Pass: 'Function Integration/Inlining' +; CGSCC-NEXT: - Pass: Function Integration/Inlining ; CGSCC-NEXT: - String: ': IR instruction count changed from ' ; CGSCC-NEXT: - IRInstrsBefore: '[[ORIG]]' ; CGSCC-NEXT: - String: ' to ' @@ -44,7 +44,7 @@ ; CGSCC-NEXT: Name: FunctionIRSizeChange ; CGSCC-NEXT: Function: ; CGSCC-NEXT: Args: -; CGSCC-NEXT: - Pass: 'Function Integration/Inlining' +; CGSCC-NEXT: - Pass: Function Integration/Inlining ; CGSCC-NEXT: - String: ': Function: ' ; CGSCC-NEXT: - Function: bar ; CGSCC-NEXT: - String: ': IR instruction count changed from ' diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll index 120ff36f204..6919528bb83 100644 --- a/test/Transforms/GVN/opt-remarks.ll +++ b/test/Transforms/GVN/opt-remarks.ll @@ -49,7 +49,7 @@ ; YAML-NEXT: --- !Missed ; YAML-NEXT: Pass: gvn ; YAML-NEXT: Name: LoadClobbered -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 3, Column: 3 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 3, Column: 3 } ; YAML-NEXT: Function: may_alias ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'load of type ' @@ -57,10 +57,10 @@ ; YAML-NEXT: - String: ' not eliminated' ; YAML-NEXT: - String: ' in favor of ' ; YAML-NEXT: - OtherAccess: load -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 1, Column: 13 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 1, Column: 13 } ; YAML-NEXT: - String: ' because it is clobbered by ' ; YAML-NEXT: - ClobberedBy: store -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 2, Column: 10 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 2, Column: 10 } ; YAML-NEXT: ... 
define i32 @arg(i32* %p, i32 %i) { diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll index 8692abfaf19..0ac76354a2b 100644 --- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll +++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll @@ -22,15 +22,15 @@ ; YAML: --- !Passed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: Inlined -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 10 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 10 } ; YAML-NEXT: Function: bar ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: foo -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 1, Column: 0 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 1, Column: 0 } ; YAML-NEXT: - String: ' inlined into ' ; YAML-NEXT: - Caller: bar -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 3, Column: 0 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 3, Column: 0 } ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' ; YAML-NEXT: - Cost: '{{[0-9\-]+}}' diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll index 10a93f5cd79..cb366dbbdd3 100644 --- a/test/Transforms/Inline/optimization-remarks-yaml.ll +++ b/test/Transforms/Inline/optimization-remarks-yaml.ll @@ -52,27 +52,27 @@ ; YAML: --- !Missed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: NoDefinition -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 5, Column: 10 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 5, Column: 10 } ; YAML-NEXT: Function: baz ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: foo ; YAML-NEXT: - String: ' will not be inlined into ' ; YAML-NEXT: - Caller: baz -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 0 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 0 } ; YAML-NEXT: - String: ' because its definition is unavailable' ; YAML-NEXT: ... 
; YAML-NEXT: --- !Missed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: NoDefinition -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 5, Column: 18 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 5, Column: 18 } ; YAML-NEXT: Function: baz ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: bar ; YAML-NEXT: - String: ' will not be inlined into ' ; YAML-NEXT: - Caller: baz -; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 0 } +; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 0 } ; YAML-NEXT: - String: ' because its definition is unavailable' ; YAML-NEXT: ... diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index 94e9874147f..4530482ec80 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -2543,9 +2543,7 @@ TEST(YAMLIO, TestEscaped) { // Single quote TestEscaped("@abc@", "'@abc@'"); // No quote - TestEscaped("abc", "abc"); - // Forward slash quoted - TestEscaped("abc/", "'abc/'"); + TestEscaped("abc/", "abc/"); // Double quote non-printable TestEscaped("\01@abc@", "\"\\x01@abc@\""); // Double quote inside single quote -- GitLab From d5e155bacd7b191e759082a68424c0f2cc60f29f Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Fri, 12 Oct 2018 16:31:20 +0000 Subject: [PATCH 0110/1116] Make YAML quote forward slashes. If you have the string /usr/bin, prior to this patch it would not be quoted by our YAML serializer. But a string like C:\src would be, due to the presence of a backslash. This makes the quoting rules of basically every single file path different depending on the path syntax (posix vs. Windows). While technically not required by the YAML specification to quote forward slashes, when the behavior of paths is inconsistent it makes it difficult to portably write FileCheck lines that will work with either kind of path. 
Differential Revision: https://reviews.llvm.org/D53169 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344359 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/YAMLTraits.h | 7 ++++++- test/CodeGen/AArch64/arm64-spill-remarks.ll | 8 ++++---- test/ObjectYAML/MachO/DWARF-BigEndian.yaml | 4 ++-- test/ObjectYAML/MachO/DWARF-LittleEndian.yaml | 4 ++-- test/ObjectYAML/MachO/DWARF-debug_str.yaml | 2 +- test/ObjectYAML/MachO/dylib_dylinker_command.yaml | 4 ++-- test/Other/size-remarks.ll | 4 ++-- test/Transforms/GVN/opt-remarks.ll | 6 +++--- .../Transforms/Inline/optimization-remarks-passed-yaml.ll | 6 +++--- test/Transforms/Inline/optimization-remarks-yaml.ll | 8 ++++---- unittests/Support/YAMLIOTest.cpp | 4 +++- 11 files changed, 32 insertions(+), 25 deletions(-) diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 5d029ad5ce9..6219755e83a 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -578,7 +578,6 @@ inline QuotingType needsQuotes(StringRef S) { // Safe scalar characters. case '_': case '-': - case '/': case '^': case '.': case ',': @@ -595,6 +594,12 @@ inline QuotingType needsQuotes(StringRef S) { // DEL (0x7F) are excluded from the allowed character range. case 0x7F: return QuotingType::Double; + // Forward slash is allowed to be unquoted, but we quote it anyway. We have + // many tests that use FileCheck against YAML output, and this output often + // contains paths. If we quote backslashes but not forward slashes then + // paths will come out either quoted or unquoted depending on which platform + // the test is run on, making FileCheck comparisons difficult. + case '/': default: { // C0 control block (0x0 - 0x1F) is excluded from the allowed character // range. 
diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll index 53a16ed748b..2d187a74445 100644 --- a/test/CodeGen/AArch64/arm64-spill-remarks.ll +++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll @@ -38,7 +38,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: /tmp/kk.c, Line: 3, Column: 20 } +; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 3, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 300 ; YAML: Args: @@ -51,7 +51,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } +; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 2, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 30000 ; YAML: Args: @@ -64,7 +64,7 @@ ; YAML: --- !Missed ; YAML: Pass: regalloc ; YAML: Name: LoopSpillReload -; YAML: DebugLoc: { File: /tmp/kk.c, Line: 1, Column: 20 } +; YAML: DebugLoc: { File: '/tmp/kk.c', Line: 1, Column: 20 } ; YAML: Function: fpr128 ; YAML: Hotness: 300 ; YAML: Args: @@ -79,7 +79,7 @@ ; THRESHOLD_YAML: --- !Missed ; THRESHOLD_YAML: Pass: regalloc ; THRESHOLD_YAML: Name: LoopSpillReload -; THRESHOLD_YAML: DebugLoc: { File: /tmp/kk.c, Line: 2, Column: 20 } +; THRESHOLD_YAML: DebugLoc: { File: '/tmp/kk.c', Line: 2, Column: 20 } ; THRESHOLD_YAML: Function: fpr128 ; THRESHOLD_YAML: Hotness: 30000 ; THRESHOLD_YAML: Args: diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml index adc95b95010..c6a45cd36ea 100644 --- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml +++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml @@ -376,8 +376,8 @@ DWARF: #CHECK: DWARF: #CHECK: debug_str: #CHECK: - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)' -#CHECK: - ../compiler-rt/lib/builtins/absvdi2.c -#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel +#CHECK: - '../compiler-rt/lib/builtins/absvdi2.c' +#CHECK: - 
'/Users/cbieneman/dev/open-source/llvm-build-rel' #CHECK: - int #CHECK: - di_int #CHECK: - long long int diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml index 1d6da66a073..1e136e67be1 100644 --- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml +++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml @@ -365,8 +365,8 @@ DWARF: #CHECK: DWARF: #CHECK: debug_str: #CHECK: - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)' -#CHECK: - ../compiler-rt/lib/builtins/absvdi2.c -#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel +#CHECK: - '../compiler-rt/lib/builtins/absvdi2.c' +#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' #CHECK: - int #CHECK: - di_int #CHECK: - long long int diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml index 417a755642b..84c5e22d255 100644 --- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml +++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml @@ -257,7 +257,7 @@ DWARF: #CHECK: - '' #CHECK: - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)' #CHECK: - hello_world.c -#CHECK: - /Users/cbieneman/dev/open-source/llvm-build-rel +#CHECK: - '/Users/cbieneman/dev/open-source/llvm-build-rel' #CHECK: - main #CHECK: - argc #CHECK: - argv diff --git a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml index 9184e3c5143..5fc6afa536e 100644 --- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml +++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml @@ -40,7 +40,7 @@ LoadCommands: #CHECK: - cmd: LC_LOAD_DYLINKER #CHECK: cmdsize: 32 #CHECK: name: 12 -#CHECK: PayloadString: /usr/lib/dyld +#CHECK: PayloadString: '/usr/lib/dyld' #CHECK: ZeroPadBytes: 7 #CHECK: - cmd: LC_LOAD_DYLIB #CHECK: cmdsize: 48 @@ -58,5 +58,5 @@ LoadCommands: #CHECK: timestamp: 2 #CHECK: current_version: 80349697 #CHECK: compatibility_version: 65536 -#CHECK: PayloadString: /usr/lib/libSystem.B.dylib +#CHECK: 
PayloadString: '/usr/lib/libSystem.B.dylib' #CHECK: ZeroPadBytes: 6 diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll index 34cb1202bb9..1e96dd02207 100644 --- a/test/Other/size-remarks.ll +++ b/test/Other/size-remarks.ll @@ -32,7 +32,7 @@ ; CGSCC-NEXT: Name: IRSizeChange ; CGSCC-NEXT: Function: ; CGSCC-NEXT: Args: -; CGSCC-NEXT: - Pass: Function Integration/Inlining +; CGSCC-NEXT: - Pass: 'Function Integration/Inlining' ; CGSCC-NEXT: - String: ': IR instruction count changed from ' ; CGSCC-NEXT: - IRInstrsBefore: '[[ORIG]]' ; CGSCC-NEXT: - String: ' to ' @@ -44,7 +44,7 @@ ; CGSCC-NEXT: Name: FunctionIRSizeChange ; CGSCC-NEXT: Function: ; CGSCC-NEXT: Args: -; CGSCC-NEXT: - Pass: Function Integration/Inlining +; CGSCC-NEXT: - Pass: 'Function Integration/Inlining' ; CGSCC-NEXT: - String: ': Function: ' ; CGSCC-NEXT: - Function: bar ; CGSCC-NEXT: - String: ': IR instruction count changed from ' diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll index 6919528bb83..120ff36f204 100644 --- a/test/Transforms/GVN/opt-remarks.ll +++ b/test/Transforms/GVN/opt-remarks.ll @@ -49,7 +49,7 @@ ; YAML-NEXT: --- !Missed ; YAML-NEXT: Pass: gvn ; YAML-NEXT: Name: LoadClobbered -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 3, Column: 3 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 3, Column: 3 } ; YAML-NEXT: Function: may_alias ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'load of type ' @@ -57,10 +57,10 @@ ; YAML-NEXT: - String: ' not eliminated' ; YAML-NEXT: - String: ' in favor of ' ; YAML-NEXT: - OtherAccess: load -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 1, Column: 13 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 1, Column: 13 } ; YAML-NEXT: - String: ' because it is clobbered by ' ; YAML-NEXT: - ClobberedBy: store -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 2, Column: 10 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 2, Column: 10 } ; YAML-NEXT: ... 
define i32 @arg(i32* %p, i32 %i) { diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll index 0ac76354a2b..8692abfaf19 100644 --- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll +++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll @@ -22,15 +22,15 @@ ; YAML: --- !Passed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: Inlined -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 10 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 10 } ; YAML-NEXT: Function: bar ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: foo -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 1, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 1, Column: 0 } ; YAML-NEXT: - String: ' inlined into ' ; YAML-NEXT: - Caller: bar -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 3, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 3, Column: 0 } ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' ; YAML-NEXT: - Cost: '{{[0-9\-]+}}' diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll index cb366dbbdd3..10a93f5cd79 100644 --- a/test/Transforms/Inline/optimization-remarks-yaml.ll +++ b/test/Transforms/Inline/optimization-remarks-yaml.ll @@ -52,27 +52,27 @@ ; YAML: --- !Missed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: NoDefinition -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 5, Column: 10 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 5, Column: 10 } ; YAML-NEXT: Function: baz ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: foo ; YAML-NEXT: - String: ' will not be inlined into ' ; YAML-NEXT: - Caller: baz -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 0 } ; YAML-NEXT: - String: ' because its definition is unavailable' ; YAML-NEXT: ... 
; YAML-NEXT: --- !Missed ; YAML-NEXT: Pass: inline ; YAML-NEXT: Name: NoDefinition -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 5, Column: 18 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 5, Column: 18 } ; YAML-NEXT: Function: baz ; YAML-NEXT: Hotness: 30 ; YAML-NEXT: Args: ; YAML-NEXT: - Callee: bar ; YAML-NEXT: - String: ' will not be inlined into ' ; YAML-NEXT: - Caller: baz -; YAML-NEXT: DebugLoc: { File: /tmp/s.c, Line: 4, Column: 0 } +; YAML-NEXT: DebugLoc: { File: '/tmp/s.c', Line: 4, Column: 0 } ; YAML-NEXT: - String: ' because its definition is unavailable' ; YAML-NEXT: ... diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp index 4530482ec80..94e9874147f 100644 --- a/unittests/Support/YAMLIOTest.cpp +++ b/unittests/Support/YAMLIOTest.cpp @@ -2543,7 +2543,9 @@ TEST(YAMLIO, TestEscaped) { // Single quote TestEscaped("@abc@", "'@abc@'"); // No quote - TestEscaped("abc/", "abc/"); + TestEscaped("abc", "abc"); + // Forward slash quoted + TestEscaped("abc/", "'abc/'"); // Double quote non-printable TestEscaped("\01@abc@", "\"\\x01@abc@\""); // Double quote inside single quote -- GitLab From b609135fc92b83d634e9ae99e6a1b84ceb2f9c3c Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 12 Oct 2018 16:35:44 +0000 Subject: [PATCH 0111/1116] [MC][ELF] fix newly added test Summary: Reland of - r344197 "[MC][ELF] compute entity size for explicit sections" - r344206 "[MC][ELF] Fix section_mergeable_size.ll" after being reverted in r344278 due to build breakages from not specifying a target triple. Move test from test/CodeGen/Generic/ to test/MC/ELF/. Add explicit target triple so we don't try to run this test on non ELF targets. 
Reported: https://reviews.llvm.org/D53056#1261707 Reviewers: fhahn, rnk, espindola, NoQ Reviewed By: fhahn, rnk Subscribers: NoQ, MaskRay, rengolin, emaste, arichardson, llvm-commits, pirama, srhines Differential Revision: https://reviews.llvm.org/D53146 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344360 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 50 ++++++++++---------- test/CodeGen/X86/section_mergeable_size.ll | 3 ++ 2 files changed, 28 insertions(+), 25 deletions(-) create mode 100644 test/CodeGen/X86/section_mergeable_size.ll diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index f6882c40531..b046cd81d6c 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -506,6 +506,30 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr; } +static unsigned getEntrySizeForKind(SectionKind Kind) { + if (Kind.isMergeable1ByteCString()) + return 1; + else if (Kind.isMergeable2ByteCString()) + return 2; + else if (Kind.isMergeable4ByteCString()) + return 4; + else if (Kind.isMergeableConst4()) + return 4; + else if (Kind.isMergeableConst8()) + return 8; + else if (Kind.isMergeableConst16()) + return 16; + else if (Kind.isMergeableConst32()) + return 32; + else { + // We shouldn't have mergeable C strings or mergeable constants that we + // didn't handle above. 
+ assert(!Kind.isMergeableCString() && "unknown string width"); + assert(!Kind.isMergeableConst() && "unknown data width"); + return 0; + } +} + MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { StringRef SectionName = GO->getSection(); @@ -550,7 +574,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal( MCSectionELF *Section = getContext().getELFSection( SectionName, getELFSectionType(SectionName, Kind), Flags, - /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol); + getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol); // Make sure that we did not get some other section with incompatible sh_link. // This should not be possible due to UniqueID code above. assert(Section->getAssociatedSymbol() == AssociatedSymbol && @@ -577,30 +601,6 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) { return ".data.rel.ro"; } -static unsigned getEntrySizeForKind(SectionKind Kind) { - if (Kind.isMergeable1ByteCString()) - return 1; - else if (Kind.isMergeable2ByteCString()) - return 2; - else if (Kind.isMergeable4ByteCString()) - return 4; - else if (Kind.isMergeableConst4()) - return 4; - else if (Kind.isMergeableConst8()) - return 8; - else if (Kind.isMergeableConst16()) - return 16; - else if (Kind.isMergeableConst32()) - return 32; - else { - // We shouldn't have mergeable C strings or mergeable constants that we - // didn't handle above. 
- assert(!Kind.isMergeableCString() && "unknown string width"); - assert(!Kind.isMergeableConst() && "unknown data width"); - return 0; - } -} - static MCSectionELF *selectELFSectionForGlobal( MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang, const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags, diff --git a/test/CodeGen/X86/section_mergeable_size.ll b/test/CodeGen/X86/section_mergeable_size.ll new file mode 100644 index 00000000000..73b70c47f03 --- /dev/null +++ b/test/CodeGen/X86/section_mergeable_size.ll @@ -0,0 +1,3 @@ +; RUN: llc -mtriple x86_64-linux-gnu < %s | FileCheck %s +@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4 +; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4 -- GitLab From 8f99faa030cc8542c434dc6fd982f38ba09655a3 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 12 Oct 2018 16:41:02 +0000 Subject: [PATCH 0112/1116] [x86] add and use fast horizontal vector math subtarget feature This is the planned follow-up to D52997. Here we are reducing horizontal vector math codegen by default. AMD Jaguar (btver2) should have no difference with this patch because it has fast-hops. (If we want to set that bit for other CPUs, let me know.) The code changes are small, but there are many test diffs. For files that are specifically testing for hops, I added RUNs to distinguish fast/slow, so we can see the consequences side-by-side. For files that are primarily concerned with codegen other than hops, I just updated the CHECK lines to reflect the new default codegen. To recap the recent horizontal op story: 1. Before rL343727, we were producing hops for all subtargets for a variety of patterns. Hops were likely not optimal for all targets though. 2. The IR improvement in r343727 exposed a hole in the backend hop pattern matching, so we reduced hop codegen for all subtargets. That was bad for Jaguar (PR39195). 3. We restored the hop codegen for all targets with rL344141. 
Good for Jaguar, but probably bad for other CPUs. 4. This patch allows us to distinguish when we want to produce hops, so everyone can be happy. I'm not sure if we have the best predicate here, but the intent is to undo the extra hop-iness that was enabled by r344141. Differential Revision: https://reviews.llvm.org/D53095 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344361 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86.td | 12 +- lib/Target/X86/X86ISelLowering.cpp | 22 +- lib/Target/X86/X86Subtarget.h | 4 + test/CodeGen/X86/avx2-phaddsub.ll | 36 +- .../X86/avx512-intrinsics-fast-isel.ll | 21 +- test/CodeGen/X86/haddsub-shuf.ll | 891 ++++++++++++++---- test/CodeGen/X86/haddsub-undef.ll | 410 ++++++-- test/CodeGen/X86/haddsub.ll | 385 ++++++-- test/CodeGen/X86/madd.ll | 69 +- test/CodeGen/X86/phaddsub.ll | 616 ++++++++---- test/CodeGen/X86/required-vector-width.ll | 6 +- test/CodeGen/X86/sad.ll | 238 ++--- test/CodeGen/X86/vector-reduce-add.ll | 356 +++---- test/CodeGen/X86/vector-reduce-fadd-fast.ll | 493 +++++----- test/CodeGen/X86/vector-shuffle-combining.ll | 39 +- 15 files changed, 2378 insertions(+), 1220 deletions(-) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 5d627f34c55..d1263a1fb45 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -404,6 +404,15 @@ def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true", "Indicates that the BEXTR instruction is implemented as a single uop " "with good throughput.">; +// Combine vector math operations with shuffles into horizontal math +// instructions if a CPU implements horizontal operations (introduced with +// SSE3) with better latency/throughput than the alternative sequence. +def FeatureFastHorizontalOps + : SubtargetFeature< + "fast-hops", "HasFastHorizontalOps", "true", + "Prefer horizontal vector math instructions (haddp, phsub, etc.) 
over " + "normal vector instructions with shuffles", [FeatureSSE3]>; + // Merge branches using three-way conditional code. def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch", "ThreewayBranchProfitable", "true", @@ -998,7 +1007,8 @@ def : ProcessorModel<"btver2", BtVer2Model, [ FeatureLAHFSAHF, FeatureFast15ByteNOP, FeatureFastBEXTR, - FeatureFastPartialYMMorZMMWrite + FeatureFastPartialYMMorZMMWrite, + FeatureFastHorizontalOps ]>; // Bulldozer diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 872d90ad004..97731dff9b2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -37031,9 +37031,6 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. - // TODO: Using a horizontal op on a single input is likely worse for - // performance on many CPUs, so this should be limited here or reversed - // in a later pass. unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; @@ -37051,6 +37048,16 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { return true; } +/// Horizontal vector math instructions may be slower than normal math with +/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch +/// implementation, and likely shuffle complexity of the alternate sequence. +static bool shouldCombineToHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize(); + bool HasFastHOps = Subtarget.hasFastHorizontalOps(); + return !IsSingleSource || IsOptimizingSize || HasFastHOps; +} + /// Do target-specific dag combines on floating-point adds/subs. 
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -37063,7 +37070,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal add/sub from adds/subs of shuffles. if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, IsFadd)) { + isHorizontalBinOp(LHS, RHS, IsFadd) && + shouldCombineToHorizontalOp(LHS == RHS, DAG, Subtarget)) { auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } @@ -39787,7 +39795,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, // Try to synthesize horizontal adds from adds of shuffles. if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) { + Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) && + shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); @@ -39918,7 +39927,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) { + Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) && + shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index ddee9a692e1..0df3058c374 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -388,6 +388,9 @@ protected: /// Processor has a single uop BEXTR implementation. 
bool HasFastBEXTR = false; + /// Try harder to combine to horizontal vector ops if they are fast. + bool HasFastHorizontalOps = false; + /// Use a retpoline thunk rather than indirect calls to block speculative /// execution. bool UseRetpolineIndirectCalls = false; @@ -636,6 +639,7 @@ public: bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } bool hasFastBEXTR() const { return HasFastBEXTR; } + bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } bool hasMacroFusion() const { return HasMacroFusion; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } diff --git a/test/CodeGen/X86/avx2-phaddsub.ll b/test/CodeGen/X86/avx2-phaddsub.ll index 67ea37575ab..99cdb100e3f 100644 --- a/test/CodeGen/X86/avx2-phaddsub.ll +++ b/test/CodeGen/X86/avx2-phaddsub.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X32,X32-FAST +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X64,X64-FAST define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) { ; X32-LABEL: phaddw1: @@ -67,15 +69,29 @@ define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) { } define <8 x i32> @phaddd3(<8 x i32> %x) { -; X32-LABEL: phaddd3: -; X32: # %bb.0: -; X32-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; X32-NEXT: retl +; X32-SLOW-LABEL: phaddd3: +; X32-SLOW: # %bb.0: +; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; X32-SLOW-NEXT: vpshufd 
{{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; X32-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; X32-SLOW-NEXT: retl ; -; X64-LABEL: phaddd3: -; X64: # %bb.0: -; X64-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X32-FAST-LABEL: phaddd3: +; X32-FAST: # %bb.0: +; X32-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; X32-FAST-NEXT: retl +; +; X64-SLOW-LABEL: phaddd3: +; X64-SLOW: # %bb.0: +; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; X64-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; X64-SLOW-NEXT: retq +; +; X64-FAST-LABEL: phaddd3: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; X64-FAST-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %r = add <8 x i32> %a, %b diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index 20c509732c8..fa37d2148f2 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6860,7 +6860,8 @@ define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) { ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} @@ -6989,7 +6990,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -7004,7 +7006,8 @@ define 
i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) { ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7210,7 +7213,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) { ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -7225,7 +7229,8 @@ define double @test_mm512_reduce_add_pd(<8 x double> %__W) { ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -7405,7 +7410,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -7422,7 +7428,8 @@ define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; 
X64-NEXT: vzeroupper ; X64-NEXT: retq entry: diff --git a/test/CodeGen/X86/haddsub-shuf.ll b/test/CodeGen/X86/haddsub-shuf.ll index ac5d5a70e30..0ece3fe1414 100644 --- a/test/CodeGen/X86/haddsub-shuf.ll +++ b/test/CodeGen/X86/haddsub-shuf.ll @@ -1,21 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST ; The next 8 tests check for matching the horizontal op and eliminating the shuffle. 
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111 define <4 x float> @hadd_v4f32(<4 x float> %a) { -; SSSE3-LABEL: hadd_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v4f32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v4f32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v4f32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4f32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4f32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4f32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %hop = fadd <2 x float> %a02, %a13 @@ -54,16 +87,51 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) { } define <8 x float> @hadd_v8f32b(<8 x float> %a) { -; SSSE3-LABEL: hadd_v8f32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, 
%xmm0 -; SSSE3-NEXT: haddps %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v8f32b: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8f32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: addps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v8f32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8f32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8f32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v8f32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v8f32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %hop = fadd <8 x float> %a0, %a1 @@ -72,15 
+140,45 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) { } define <4 x float> @hsub_v4f32(<4 x float> %a) { -; SSSE3-LABEL: hsub_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubps %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v4f32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: subps %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v4f32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v4f32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4f32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4f32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4f32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %hop = fsub <2 x float> %a02, %a13 @@ -119,16 +217,51 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) { } define <8 x float> @hsub_v8f32b(<8 x float> %a) { -; SSSE3-LABEL: hsub_v8f32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubps 
%xmm0, %xmm0 -; SSSE3-NEXT: hsubps %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v8f32b: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8f32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: subps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: subps %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v8f32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8f32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8f32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v8f32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v8f32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %hop = fsub <8 x float> %a0, %a1 @@ 
-137,15 +270,42 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) { } define <2 x double> @hadd_v2f64(<2 x double> %a) { -; SSSE3-LABEL: hadd_v2f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddpd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v2f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v2f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v2f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v2f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v2f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v2f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %hop = fadd <2 x double> %a0, %a1 @@ -154,16 +314,47 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) { } define <4 x double> @hadd_v4f64(<4 x double> %a) { -; SSSE3-LABEL: hadd_v4f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddpd %xmm0, %xmm0 -; SSSE3-NEXT: haddpd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq 
+; SSSE3_SLOW-LABEL: hadd_v4f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v4f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v4f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fadd <4 x double> %a0, %a1 @@ -172,15 +363,42 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) { } define <2 x double> @hsub_v2f64(<2 x double> %a) { -; SSSE3-LABEL: hsub_v2f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubpd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v2f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 +; 
SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v2f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v2f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v2f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v2f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v2f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %hop = fsub <2 x double> %a0, %a1 @@ -189,16 +407,47 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) { } define <4 x double> @hsub_v4f64(<4 x double> %a) { -; SSSE3-LABEL: hsub_v4f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubpd %xmm0, %xmm0 -; SSSE3-NEXT: hsubpd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v4f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1 +; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} 
xmm1 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v4f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v4f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fsub <4 x double> %a0, %a1 @@ -207,15 +456,44 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) { } define <4 x i32> @hadd_v4i32(<4 x i32> %a) { -; SSSE3-LABEL: hadd_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v4i32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v4i32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v4i32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; 
AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4i32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4i32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4i32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %hop = add <4 x i32> %a02, %a13 @@ -254,25 +532,57 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) { } define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { -; SSSE3-LABEL: hadd_v8i32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: phaddd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hadd_v8i32b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hadd_v8i32b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8i32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: paddd %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: paddd %xmm3, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v8i32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8i32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1_SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8i32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v8i32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v8i32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %hop = add <8 x i32> %a0, %a1 @@ -281,15 +591,44 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { } define <4 x i32> @hsub_v4i32(<4 x i32> %a) { -; SSSE3-LABEL: hsub_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; 
AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v4i32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v4i32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v4i32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4i32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4i32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4i32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %hop = sub <4 x i32> %a02, %a13 @@ -328,25 +667,57 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) { } define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { -; SSSE3-LABEL: hsub_v8i32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: phsubd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hsub_v8i32b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; 
AVX2-LABEL: hsub_v8i32b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphsubd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8i32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v8i32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8i32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1_SLOW-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8i32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v8i32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vpsubd %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v8i32b: +; 
AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %hop = sub <8 x i32> %a0, %a1 @@ -355,15 +726,45 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { } define <8 x i16> @hadd_v8i16(<8 x i16> %a) { -; SSSE3-LABEL: hadd_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8i16: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v8i16: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8i16: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8i16: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v8i16: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX2_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v8i16: +; AVX2_FAST: # %bb.0: 
+; AVX2_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %hop = add <8 x i16> %a0246, %a1357 @@ -402,25 +803,64 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) { } define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { -; SSSE3-LABEL: hadd_v16i16b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: phaddw %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hadd_v16i16b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hadd_v16i16b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v16i16b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3 +; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4 +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: paddw %xmm3, %xmm0 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: paddw %xmm4, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v16i16b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v16i16b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, 
%xmm3 +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v16i16b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v16i16b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2_SLOW-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v16i16b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %hop = add <16 x i16> %a0, %a1 @@ -429,15 +869,45 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { } define <8 x i16> @hsub_v8i16(<8 x i16> %a) { -; SSSE3-LABEL: hsub_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8i16: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1 +; 
SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v8i16: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8i16: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8i16: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v8i16: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX2_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v8i16: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %hop = sub <8 x i16> %a0246, %a1357 @@ -476,25 +946,64 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) { } define <16 x i16> @hsub_v16i16b(<16 x i16> %a) { -; SSSE3-LABEL: hsub_v16i16b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubw %xmm0, %xmm0 -; SSSE3-NEXT: phsubw %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hsub_v16i16b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphsubw %xmm0, 
%xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hsub_v16i16b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphsubw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v16i16b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3 +; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4 +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: psubw %xmm1, %xmm4 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v16i16b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v16i16b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v16i16b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, 
%xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v16i16b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2_SLOW-NEXT: vpsubw %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v16i16b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubw %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %hop = sub <16 x i16> %a0, %a1 diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll index d7c0936a474..e0590a76615 100644 --- a/test/CodeGen/X86/haddsub-undef.ll +++ b/test/CodeGen/X86/haddsub-undef.ll @@ -1,7 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck 
%s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs. @@ -339,8 +342,6 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %vecinit5 } -; On AVX2, the following sequence can be folded into a single horizontal add. -; If the Subtarget doesn't support AVX2, then we avoid emitting two packed ; integer horizontal adds instead of two scalar adds followed by vector inserts. define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: test15_undef: @@ -451,15 +452,38 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) { } define <2 x double> @add_pd_003(<2 x double> %x) { -; SSE-LABEL: add_pd_003: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_pd_003: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_pd_003: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_pd_003: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_pd_003: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_pd_003: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_pd_003: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_pd_003: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x 
ret <2 x double> %add @@ -468,31 +492,84 @@ define <2 x double> @add_pd_003(<2 x double> %x) { ; Change shuffle mask - no undefs. define <2 x double> @add_pd_003_2(<2 x double> %x) { -; SSE-LABEL: add_pd_003_2: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_pd_003_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_pd_003_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_pd_003_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_pd_003_2: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_pd_003_2: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_pd_003_2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_pd_003_2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x ret <2 x double> %add } define <2 x double> @add_pd_010(<2 x double> %x) { -; SSE-LABEL: add_pd_010: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_pd_010: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_pd_010: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; 
AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_pd_010: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_pd_010: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_pd_010: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_pd_010: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_pd_010: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> @@ -500,15 +577,42 @@ define <2 x double> @add_pd_010(<2 x double> %x) { } define <4 x float> @add_ps_007(<4 x float> %x) { -; SSE-LABEL: add_ps_007: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_ps_007: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_ps_007: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_ps_007: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_007: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[0,1,0,2] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_007: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_007: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_007: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -516,17 +620,48 @@ define <4 x float> @add_ps_007(<4 x float> %x) { } define <4 x float> @add_ps_030(<4 x float> %x) { -; SSE-LABEL: add_ps_030: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_ps_030: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_ps_030: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_ps_030: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_030: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[3,2,2,3] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_030: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_030: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_030: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -535,15 +670,41 @@ define <4 x float> @add_ps_030(<4 x float> %x) { } define <4 x float> @add_ps_007_2(<4 x float> %x) { -; SSE-LABEL: add_ps_007_2: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_ps_007_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_ps_007_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_ps_007_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_007_2: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_007_2: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_007_2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: 
vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_007_2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -551,32 +712,83 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) { } define <4 x float> @add_ps_008(<4 x float> %x) { -; SSE-LABEL: add_ps_008: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_ps_008: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_ps_008: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_ps_008: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_008: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_008: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_008: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_008: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %x ret <4 x float> %add } define <4 x float> @add_ps_017(<4 x float> %x) { -; SSE-LABEL: add_ps_017: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: 
retq +; SSE-SLOW-LABEL: add_ps_017: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_ps_017: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_ps_017: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_017: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_017: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_017: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_017: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %x %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> @@ -584,17 +796,47 @@ define <4 x float> @add_ps_017(<4 x float> %x) { } define <4 x float> @add_ps_018(<4 x float> %x) { -; SSE-LABEL: add_ps_018: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_ps_018: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = 
xmm0[0,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: add_ps_018: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: add_ps_018: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_018: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_018: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_018: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_018: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll index 030de9c7f14..6221d4e43bc 100644 --- a/test/CodeGen/X86/haddsub.ll +++ b/test/CodeGen/X86/haddsub.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s 
--check-prefix=SSE3 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) { ; SSE3-LABEL: haddpd1: @@ -35,15 +37,29 @@ define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) { } define <2 x double> @haddpd3(<2 x double> %x) { -; SSE3-LABEL: haddpd3: -; SSE3: # %bb.0: -; SSE3-NEXT: haddpd %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: haddpd3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: haddpd3: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: haddpd3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddpd3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddpd3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %r = fadd <2 x double> %a, %b @@ -83,15 +99,30 @@ define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) { } define <4 x float> @haddps3(<4 x float> %x) { -; SSE3-LABEL: haddps3: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq +; 
SSE3-SLOW-LABEL: haddps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: haddps3: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: haddps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -99,15 +130,30 @@ define <4 x float> @haddps3(<4 x float> %x) { } define <4 x float> @haddps4(<4 x float> %x) { -; SSE3-LABEL: haddps4: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: haddps4: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: haddps4: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: haddps4: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: 
vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -115,15 +161,30 @@ define <4 x float> @haddps4(<4 x float> %x) { } define <4 x float> @haddps5(<4 x float> %x) { -; SSE3-LABEL: haddps5: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: haddps5: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,2,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: haddps5: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: haddps5: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps5: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps5: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -131,15 +192,27 @@ define <4 x float> @haddps5(<4 x float> %x) { } define <4 x float> @haddps6(<4 x float> %x) { -; SSE3-LABEL: haddps6: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: haddps6: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: haddps6: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: haddps6: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; 
SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -147,15 +220,30 @@ define <4 x float> @haddps6(<4 x float> %x) { } define <4 x float> @haddps7(<4 x float> %x) { -; SSE3-LABEL: haddps7: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: haddps7: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: haddps7: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: haddps7: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps7: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps7: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -179,15 +267,28 @@ define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) { } define <2 x double> @hsubpd2(<2 x double> %x) { -; SSE3-LABEL: hsubpd2: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubpd %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: hsubpd2: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movapd 
%xmm0, %xmm1 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: hsubpd2: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: hsubpd2: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubpd2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubpd2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %r = fsub <2 x double> %a, %b @@ -211,15 +312,31 @@ define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) { } define <4 x float> @hsubps2(<4 x float> %x) { -; SSE3-LABEL: hsubps2: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: hsubps2: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE3-SLOW-NEXT: subps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: hsubps2: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: hsubps2: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubps2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubps2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector 
<4 x float> %x, <4 x float> undef, <4 x i32> %r = fsub <4 x float> %a, %b @@ -227,15 +344,31 @@ define <4 x float> @hsubps2(<4 x float> %x) { } define <4 x float> @hsubps3(<4 x float> %x) { -; SSE3-LABEL: hsubps3: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: hsubps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: subps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: hsubps3: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: hsubps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fsub <4 x float> %a, %b @@ -243,15 +376,27 @@ define <4 x float> @hsubps3(<4 x float> %x) { } define <4 x float> @hsubps4(<4 x float> %x) { -; SSE3-LABEL: hsubps4: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: hsubps4: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE3-SLOW-NEXT: subps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: hsubps4: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: hsubps4: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubps4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: 
vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubps4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fsub <4 x float> %a, %b @@ -293,16 +438,35 @@ define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) { } define <8 x float> @vhaddps3(<8 x float> %x) { -; SSE3-LABEL: vhaddps3: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: haddps %xmm1, %xmm1 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: vhaddps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE3-SLOW-NEXT: addps %xmm2, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: addps %xmm3, %xmm0 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: vhaddps3: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: vhaddps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: vhaddps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX-SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: vhaddps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-FAST-NEXT: retq %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %r = fadd <8 x float> %a, %b @@ -327,16 +491,37 @@ define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) { } 
define <8 x float> @vhsubps3(<8 x float> %x) { -; SSE3-LABEL: vhsubps3: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: hsubps %xmm1, %xmm1 -; SSE3-NEXT: retq +; SSE3-SLOW-LABEL: vhsubps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE3-SLOW-NEXT: subps %xmm1, %xmm2 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: subps %xmm0, %xmm3 +; SSE3-SLOW-NEXT: movaps %xmm3, %xmm0 +; SSE3-SLOW-NEXT: movaps %xmm2, %xmm1 +; SSE3-SLOW-NEXT: retq ; -; AVX-LABEL: vhsubps3: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSE3-FAST-LABEL: vhsubps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: hsubps %xmm1, %xmm1 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: vhsubps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX-SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: vhsubps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; AVX-FAST-NEXT: retq %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %r = fsub <8 x float> %a, %b diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index 30320a750e0..c36faecbf85 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -50,7 +50,8 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read ; AVX-NEXT: # %bb.2: # %middle.block ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; 
AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: @@ -129,7 +130,8 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -153,7 +155,8 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -252,7 +255,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -278,7 +282,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -437,7 +442,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vpaddd %xmm0, 
%xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -469,7 +475,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -620,7 +627,8 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly ; AVX-NEXT: # %bb.2: # %middle.block ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: @@ -704,7 +712,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -729,7 +738,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: 
vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -836,7 +846,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -863,7 +874,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1039,7 +1051,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1073,7 +1086,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1222,7 +1236,8 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read ; AVX-NEXT: # %bb.2: # %middle.block ; AVX-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: @@ -1313,7 +1328,8 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1338,7 +1354,8 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -1460,7 +1477,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1491,7 +1509,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; 
AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1699,7 +1718,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1742,7 +1762,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2692,7 +2713,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; @@ -2707,7 +2729,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll index 7b3f8db76c4..b4ff08cd254 100644 --- a/test/CodeGen/X86/phaddsub.ll +++ b/test/CodeGen/X86/phaddsub.ll @@ -1,6 +1,8 @@ 
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) { ; SSSE3-LABEL: phaddw1: @@ -67,15 +69,29 @@ define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) { } define <4 x i32> @phaddd3(<4 x i32> %x) { -; SSSE3-LABEL: phaddd3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd3: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -83,15 +99,29 @@ define <4 x i32> @phaddd3(<4 x i32> %x) { } 
define <4 x i32> @phaddd4(<4 x i32> %x) { -; SSSE3-LABEL: phaddd4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd4: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -99,15 +129,29 @@ define <4 x i32> @phaddd4(<4 x i32> %x) { } define <4 x i32> @phaddd5(<4 x i32> %x) { -; SSSE3-LABEL: phaddd5: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd5: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd5: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd5: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd5: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; 
AVX-FAST-LABEL: phaddd5: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -115,15 +159,27 @@ define <4 x i32> @phaddd5(<4 x i32> %x) { } define <4 x i32> @phaddd6(<4 x i32> %x) { -; SSSE3-LABEL: phaddd6: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd6: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd6: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd6: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -131,15 +187,29 @@ define <4 x i32> @phaddd6(<4 x i32> %x) { } define <4 x i32> @phaddd7(<4 x i32> %x) { -; SSSE3-LABEL: phaddd7: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd7: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd7: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd7: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd7: 
+; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd7: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -179,15 +249,30 @@ define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) { } define <4 x i32> @phsubd2(<4 x i32> %x) { -; SSSE3-LABEL: phsubd2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phsubd2: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phsubd2: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phsubd2: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phsubd2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phsubd2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = sub <4 x i32> %a, %b @@ -195,15 +280,30 @@ define <4 x i32> @phsubd2(<4 x i32> %x) { } define <4 x i32> @phsubd3(<4 x i32> %x) { -; SSSE3-LABEL: phsubd3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phsubd3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = 
xmm0[0,2,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phsubd3: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phsubd3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phsubd3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phsubd3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = sub <4 x i32> %a, %b @@ -211,15 +311,27 @@ define <4 x i32> @phsubd3(<4 x i32> %x) { } define <4 x i32> @phsubd4(<4 x i32> %x) { -; SSSE3-LABEL: phsubd4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phsubd4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phsubd4: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phsubd4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phsubd4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phsubd4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = sub <4 x i32> %a, %b @@ -284,15 +396,29 @@ define <4 x i32> 
@phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) { } define <4 x i32> @phaddd_single_source1(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source1: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd_single_source1: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd_single_source1: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -300,17 +426,33 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) { } define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source2: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd_single_source2: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; 
AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd_single_source2: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -319,15 +461,29 @@ define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { } define <4 x i32> @phaddd_single_source3(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd_single_source3: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd_single_source3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = 
shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -335,32 +491,58 @@ define <4 x i32> @phaddd_single_source3(<4 x i32> %x) { } define <4 x i32> @phaddd_single_source4(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd_single_source4: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd_single_source4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %x ret <4 x i32> %add } define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source5: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source5: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd_single_source5: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd_single_source5: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, 
%xmm0 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source5: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source5: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %x %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> @@ -368,17 +550,33 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { } define <4 x i32> @phaddd_single_source6(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source6: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source6: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddd_single_source6: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddd_single_source6: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: 
phaddd_single_source6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -387,15 +585,30 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) { } define <8 x i16> @phaddw_single_source1(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source1: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13] +; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddw_single_source1: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddw_single_source1: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13] +; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15] +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -403,19 +616,41 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) { } define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source2: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddw_single_source2: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddw_single_source2: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ 
-424,15 +659,33 @@ define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { } define <8 x i16> @phaddw_single_source3(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddw_single_source3: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddw_single_source3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -440,32 +693,63 @@ define <8 x i16> @phaddw_single_source3(<8 x i16> %x) { } define <8 x i16> @phaddw_single_source4(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: pslld $16, %xmm1 +; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movdqa %xmm1, 
%xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddw_single_source4: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddw_single_source4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %x ret <8 x i16> %add } define <8 x i16> @phaddw_single_source6(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source6: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source6: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: phaddw_single_source6: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: phaddw_single_source6: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX-SLOW-NEXT: vpmovzxwq {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r diff --git a/test/CodeGen/X86/required-vector-width.ll b/test/CodeGen/X86/required-vector-width.ll index 368c8acd4f8..6693e3c67a5 100644 --- a/test/CodeGen/X86/required-vector-width.ll +++ b/test/CodeGen/X86/required-vector-width.ll @@ -190,7 +190,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -319,7 +320,8 @@ define i32 @sad_16i8_256() "required-vector-width"="256" { ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll index 314b0c74f9f..d7d1511d19d 100644 --- a/test/CodeGen/X86/sad.ll +++ b/test/CodeGen/X86/sad.ll @@ -56,7 +56,8 @@ define i32 @sad_16i8() nounwind { ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -80,7 +81,8 @@ define i32 @sad_16i8() nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -152,16 +154,16 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: pxor %xmm12, %xmm12 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm13, %xmm13 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm15, %xmm15 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body @@ -219,17 +221,17 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: paddd %xmm6, %xmm7 ; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, 
-{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: paddd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 @@ -244,9 +246,9 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -256,9 +258,9 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm8 @@ -267,13 +269,13 @@ define i32 @sad_32i8() nounwind { ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload ; SSE2-NEXT: paddd %xmm15, %xmm0 ; SSE2-NEXT: paddd %xmm14, %xmm13 ; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm13, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] @@ -317,7 +319,8 @@ define i32 @sad_32i8() nounwind { ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -343,7 +346,8 @@ define i32 @sad_32i8() nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -420,42 +424,42 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; 
SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 
-{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 ; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 @@ -516,7 +520,7 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] ; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: psubd %xmm0, %xmm15 ; SSE2-NEXT: movdqa %xmm7, %xmm0 @@ -524,8 +528,8 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] @@ -534,7 +538,7 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] @@ -563,16 +567,16 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm6, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm5 @@ -584,118 +588,118 @@ define i32 @sad_avx64i8() nounwind { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm8 ; 
SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm11, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm15, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm15 ; SSE2-NEXT: pxor %xmm1, %xmm15 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; 
SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm12, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: 
movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: movdqa %xmm13, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: 
paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 @@ -737,30 +741,30 @@ define i32 @sad_avx64i8() nounwind { ; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4 @@ -803,27 +807,27 @@ define i32 @sad_avx64i8() nounwind { ; AVX1-NEXT: vpabsd %xmm4, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm8 ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10 ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 @@ -858,7 +862,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper @@ -886,10 +891,10 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -903,9 +908,9 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm8 # 32-byte Folded Reload ; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpabsd %ymm9, %ymm8 ; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 @@ -935,7 +940,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1430,7 +1436,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; @@ -1448,7 +1455,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1533,7 +1541,8 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; @@ -1548,7 +1557,8 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* % ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-reduce-add.ll b/test/CodeGen/X86/vector-reduce-add.ll index 7a5e5f34ad3..21c10c97f49 100644 --- a/test/CodeGen/X86/vector-reduce-add.ll +++ b/test/CodeGen/X86/vector-reduce-add.ll @@ -195,28 +195,21 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: phaddd %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq ; @@ -224,7 +217,8 @@ define i32 @test_v4i32(<4 x i32> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; 
AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0) @@ -232,24 +226,15 @@ define i32 @test_v4i32(<4 x i32> %a0) { } define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: phaddd %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: @@ -257,7 +242,8 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -268,7 +254,8 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -279,7 +266,8 @@ define i32 @test_v8i32(<8 x i32> %a0) { ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -288,28 +276,17 @@ define i32 @test_v8i32(<8 x i32> %a0) { } define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: paddd %xmm2, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: phaddd %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i32: ; AVX1: # %bb.0: @@ -320,7 +297,8 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -332,7 +310,8 @@ define i32 @test_v16i32(<16 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -355,36 +334,21 @@ define i32 @test_v16i32(<16 x i32> %a0) { } define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: paddd %xmm6, %xmm2 -; SSE41-NEXT: paddd %xmm7, %xmm3 -; SSE41-NEXT: paddd %xmm5, %xmm3 -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: paddd %xmm4, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: phaddd %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v32i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm5, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm2 +; SSE-NEXT: paddd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: @@ -401,7 +365,8 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; 
AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -415,7 +380,8 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -443,29 +409,18 @@ define i32 @test_v32i32(<32 x i32> %a0) { ; define i16 @test_v8i16(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: phaddw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX-LABEL: 
test_v8i16: ; AVX: # %bb.0: @@ -473,7 +428,8 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -484,7 +440,8 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -493,31 +450,19 @@ define i16 @test_v8i16(<8 x i16> %a0) { } define i16 @test_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: phaddw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: @@ -527,7 +472,8 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -541,7 +487,8 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -555,7 +502,8 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -565,35 +513,21 @@ define i16 @test_v16i16(<16 x i16> %a0) { } define i16 @test_v32i16(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: paddw %xmm3, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddw 
%xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: phaddw %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i16: ; AVX1: # %bb.0: @@ -606,7 +540,8 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -621,7 +556,8 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; 
AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -648,43 +584,25 @@ define i16 @test_v32i16(<32 x i16> %a0) { } define i16 @test_v64i16(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16: -; SSE2: # %bb.0: -; SSE2-NEXT: paddw %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm7, %xmm3 -; SSE2-NEXT: paddw %xmm5, %xmm3 -; SSE2-NEXT: paddw %xmm1, %xmm3 -; SSE2-NEXT: paddw %xmm4, %xmm2 -; SSE2-NEXT: paddw %xmm3, %xmm2 -; SSE2-NEXT: paddw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: paddw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16: -; SSE41: # %bb.0: -; SSE41-NEXT: paddw %xmm6, %xmm2 -; SSE41-NEXT: paddw %xmm7, %xmm3 -; SSE41-NEXT: paddw %xmm5, %xmm3 -; SSE41-NEXT: paddw %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm4, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: paddw %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: phaddw %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm6, %xmm2 +; SSE-NEXT: paddw %xmm7, %xmm3 +; SSE-NEXT: paddw %xmm5, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: paddw %xmm4, %xmm2 +; SSE-NEXT: paddw %xmm3, %xmm2 +; SSE-NEXT: paddw %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: 
# kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: @@ -703,7 +621,8 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -720,7 +639,8 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/test/CodeGen/X86/vector-reduce-fadd-fast.ll index b17734b83e7..281c4f28d99 100644 --- a/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ b/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -20,18 +20,20 @@ define float @test_v2f32(float %a0, <2 x float> %a1) { ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret 
float %1 @@ -50,24 +52,27 @@ define float @test_v4f32(float %a0, <4 x float> %a1) { ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: addps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 @@ -88,10 +93,11 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: addps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: @@ -100,7 +106,8 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; 
AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -111,7 +118,8 @@ define float @test_v8f32(float %a0, <8 x float> %a1) { ; AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -138,10 +146,11 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; SSE41-NEXT: addps %xmm4, %xmm2 ; SSE41-NEXT: addps %xmm3, %xmm1 ; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: addps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: @@ -151,7 +160,8 @@ define float @test_v16f32(float %a0, <16 x float> %a1) { ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -188,17 +198,20 @@ define float @test_v2f32_zero(<2 x float> %a0) { ; ; SSE41-LABEL: test_v2f32_zero: ; SSE41: # %bb.0: -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: 
test_v2f32_zero: ; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 @@ -220,7 +233,8 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -228,14 +242,16 @@ define float @test_v4f32_zero(<4 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32_zero: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 @@ -259,7 +275,8 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq 
; @@ -269,7 +286,8 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -280,7 +298,8 @@ define float @test_v8f32_zero(<8 x float> %a0) { ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -310,7 +329,8 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -321,7 +341,8 @@ define float @test_v16f32_zero(<16 x float> %a0) { ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -358,17 +379,20 @@ define float @test_v2f32_undef(<2 x float> %a0) { ; ; SSE41-LABEL: test_v2f32_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32_undef: ; AVX: # %bb.0: -; 
AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) ret float %1 @@ -390,7 +414,8 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -398,14 +423,16 @@ define float @test_v4f32_undef(<4 x float> %a0) { ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) ret float %1 @@ -429,7 +456,8 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -439,7 +467,8 @@ 
define float @test_v8f32_undef(<8 x float> %a0) { ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -450,7 +479,8 @@ define float @test_v8f32_undef(<8 x float> %a0) { ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -480,7 +510,8 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -491,7 +522,8 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -518,53 +550,43 @@ define float @test_v16f32_undef(<16 x float> %a0) { ; define double @test_v2f64(double %a0, <2 x double> %a1) { -; SSE2-LABEL: test_v2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: 
test_v2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } define double @test_v4f64(double %a0, <4 x double> %a1) { -; SSE2-LABEL: test_v4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -573,7 +595,8 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX512-NEXT: vaddpd 
%ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -582,31 +605,23 @@ define double @test_v4f64(double %a0, <4 x double> %a1) { } define double @test_v8f64(double %a0, <8 x double> %a1) { -; SSE2-LABEL: test_v8f64: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm4, %xmm2 -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: addpd %xmm4, %xmm2 -; SSE41-NEXT: addpd %xmm3, %xmm0 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v8f64: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm4, %xmm2 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -627,32 +642,19 @@ define double @test_v8f64(double %a0, <8 x double> %a1) { } define double @test_v16f64(double %a0, <16 x double> %a1) { -; SSE2-LABEL: test_v16f64: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm6, %xmm2 -; SSE2-NEXT: addpd %xmm7, %xmm3 -; SSE2-NEXT: addpd %xmm5, %xmm1 -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4 -; SSE2-NEXT: addpd %xmm2, %xmm4 -; 
SSE2-NEXT: addpd %xmm1, %xmm4 -; SSE2-NEXT: movapd %xmm4, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: addpd %xmm4, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm4, %xmm0 -; SSE41-NEXT: addpd %xmm6, %xmm2 -; SSE41-NEXT: addpd %xmm7, %xmm3 -; SSE41-NEXT: addpd %xmm5, %xmm1 -; SSE41-NEXT: addpd %xmm3, %xmm1 -; SSE41-NEXT: addpd {{[0-9]+}}(%rsp), %xmm0 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v16f64: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: addpd %xmm7, %xmm3 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: addpd %xmm2, %xmm4 +; SSE-NEXT: addpd %xmm1, %xmm4 +; SSE-NEXT: movapd %xmm4, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1] +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: @@ -661,7 +663,8 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -687,53 +690,45 @@ define double @test_v16f64(double %a0, <16 x double> %a1) { ; define double @test_v2f64_zero(<2 x double> %a0) { -; SSE2-LABEL: test_v2f64_zero: -; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2f64_zero: -; SSE41: # %bb.0: -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v2f64_zero: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm1 
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0) ret double %1 } define double @test_v4f64_zero(<4 x double> %a0) { -; SSE2-LABEL: test_v4f64_zero: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4f64_zero: -; SSE41: # %bb.0: -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v4f64_zero: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_zero: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -742,7 +737,8 @@ define double @test_v4f64_zero(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; 
AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -751,31 +747,24 @@ define double @test_v4f64_zero(<4 x double> %a0) { } define double @test_v8f64_zero(<8 x double> %a0) { -; SSE2-LABEL: test_v8f64_zero: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd %xmm2, %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8f64_zero: -; SSE41: # %bb.0: -; SSE41-NEXT: addpd %xmm3, %xmm1 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v8f64_zero: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_zero: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -796,32 +785,19 @@ define double @test_v8f64_zero(<8 x double> %a0) { } define double @test_v16f64_zero(<16 x double> %a0) { -; SSE2-LABEL: test_v16f64_zero: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm6, %xmm2 -; SSE2-NEXT: addpd %xmm4, %xmm0 -; SSE2-NEXT: addpd %xmm2, %xmm0 -; SSE2-NEXT: addpd %xmm7, %xmm3 -; SSE2-NEXT: addpd %xmm5, %xmm1 -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = 
xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16f64_zero: -; SSE41: # %bb.0: -; SSE41-NEXT: addpd %xmm6, %xmm2 -; SSE41-NEXT: addpd %xmm4, %xmm0 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: addpd %xmm7, %xmm3 -; SSE41-NEXT: addpd %xmm5, %xmm1 -; SSE41-NEXT: addpd %xmm3, %xmm1 -; SSE41-NEXT: addpd %xmm0, %xmm1 -; SSE41-NEXT: haddpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v16f64_zero: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: addpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm3 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64_zero: ; AVX: # %bb.0: @@ -830,7 +806,8 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -856,53 +833,45 @@ define double @test_v16f64_zero(<16 x double> %a0) { ; define double @test_v2f64_undef(<2 x double> %a0) { -; SSE2-LABEL: test_v2f64_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2f64_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v2f64_undef: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; 
SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0) ret double %1 } define double @test_v4f64_undef(<4 x double> %a0) { -; SSE2-LABEL: test_v4f64_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4f64_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v4f64_undef: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64_undef: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -911,7 +880,8 @@ define double @test_v4f64_undef(<4 x double> %a0) { ; AVX512: # %bb.0: ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; 
AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -920,31 +890,24 @@ define double @test_v4f64_undef(<4 x double> %a0) { } define double @test_v8f64_undef(<8 x double> %a0) { -; SSE2-LABEL: test_v8f64_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd %xmm2, %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8f64_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: addpd %xmm3, %xmm1 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v8f64_undef: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: ; AVX: # %bb.0: ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -965,32 +928,19 @@ define double @test_v8f64_undef(<8 x double> %a0) { } define double @test_v16f64_undef(<16 x double> %a0) { -; SSE2-LABEL: test_v16f64_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm6, %xmm2 -; SSE2-NEXT: addpd %xmm4, %xmm0 -; SSE2-NEXT: addpd %xmm2, %xmm0 -; SSE2-NEXT: addpd %xmm7, %xmm3 -; SSE2-NEXT: addpd %xmm5, %xmm1 -; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: 
test_v16f64_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: addpd %xmm6, %xmm2 -; SSE41-NEXT: addpd %xmm4, %xmm0 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: addpd %xmm7, %xmm3 -; SSE41-NEXT: addpd %xmm5, %xmm1 -; SSE41-NEXT: addpd %xmm3, %xmm1 -; SSE41-NEXT: addpd %xmm0, %xmm1 -; SSE41-NEXT: haddpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v16f64_undef: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm6, %xmm2 +; SSE-NEXT: addpd %xmm4, %xmm0 +; SSE-NEXT: addpd %xmm2, %xmm0 +; SSE-NEXT: addpd %xmm7, %xmm3 +; SSE-NEXT: addpd %xmm5, %xmm1 +; SSE-NEXT: addpd %xmm3, %xmm1 +; SSE-NEXT: addpd %xmm0, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64_undef: ; AVX: # %bb.0: @@ -999,7 +949,8 @@ define double @test_v16f64_undef(<16 x double> %a0) { ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index 5c0a223d496..2eb9362947e 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2700,36 +2700,21 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { } define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { -; SSE2-LABEL: PR22377: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR22377: -; SSSE3: # 
%bb.0: # %entry -; SSSE3-NEXT: movaps %xmm0, %xmm1 -; SSSE3-NEXT: haddps %xmm0, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR22377: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm0, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE41-NEXT: retq +; SSE-LABEL: PR22377: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; SSE-NEXT: addps %xmm0, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: retq ; ; AVX-LABEL: PR22377: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: retq entry: %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> -- GitLab From f0303e4307941c869627facd190e9bab1977baf0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 12 Oct 2018 17:01:46 +0000 Subject: [PATCH 0113/1116] [BPF] Add BTF generation for BPF target BTF is the debug format for BPF, a kernel virtual machine and widely used for tracing, networking and security, etc ([1]). Currently only instruction streams are passed to kernel, the kernel verifier verifies them before execution. 
In order to provide better visibility of bpf programs to user space tools, some debug information, e.g., function names and debug line information, is desirable for the kernel so that tools can get such information and better annotate jited instructions for performance or other reasons. DWARF is too complicated for the kernel and for BPF. Hence, BTF is designed to be the debug format for BPF ([2]). Right now, pahole supports BTF for types, which are generated based on dwarf sections in the ELF file. In order to annotate performance metrics for jited bpf insns, it is necessary to pass debug line info to the kernel. Furthermore, we want to pass the actual code to the kernel for the following reasons: . a bpf program is typically small, so the storage overhead should be small. . in bpf land, it is entirely possible that an application loads the bpf program into the kernel and then quits, so having the user space application hold the debug info is not practical. . having the source code kept directly by the kernel would ease deployment, since the original source code does not need to be shipped to every host and the kernel-devel package does not need to be deployed even if kernel headers are used. The only reliable time to get the source code is at compilation time. This results in both more accurate information and easier deployment, as stated above. Another consideration is JIT. Projects like bcc use MCJIT to compile a C program into bpf insns and load them into the kernel ([3]). The generated BTF sections will be readily available for such cases as well. This patch implements generation of BTF info in the llvm compiler. The BTF related sections will be generated when both -target bpf and -g are specified. Two sections are generated: .BTF contains all the type and string information, and .BTF.ext contains the func_info and line_info. The separation is related to how the two sections are used differently in a bpf loader, e.g., linux libbpf ([4]). 
The .BTF section can be loaded into the kernel directly, while .BTF.ext needs loader manipulation before being loaded into the kernel. The format of each section is roughly defined in llvm:include/llvm/MC/MCBTFContext.h and by the implementation in llvm:lib/MC/MCBTFContext.cpp. A later example also shows the contents of each section. The type and func_info are gathered during CodeGen/AsmPrinter by traversing dwarf debug_info. The line_info is gathered in MCObjectStreamer before writing to the object file. After all the information is gathered, the two sections are emitted in MCObjectStreamer::finishImpl. With cmake CMAKE_BUILD_TYPE=Debug, the compiler can dump out all the tables except insn offset, which will be resolved later as relocation records. The debug type "btf" is used for BTFContext dump. Dwarf debug info generation is tested by using llvm-dwarfdump to decode the binary sections and check whether the result is as expected. We do not have such a tool for BTF yet. We will implement btf dump functionality in bpftool ([5]), as bpftool is considered the recommended tool for bpf introspection. The implementation for type and func_info is tested with linux kernel test cases. The line_info is visually checked with the dump from linux kernel libbpf ([4]) and checked with readelf dumping section raw data. Note that the .BTF and .BTF.ext information will not be emitted to assembly code and there is no assembler support for BTF either. Below, with a clang/llvm built with CMAKE_BUILD_TYPE=Debug, the contents of each table are shown for a simple C program. 
-bash-4.2$ cat -n test.c 1 struct A { 2 int a; 3 char b; 4 }; 5 6 int test(struct A *t) { 7 return t->a; 8 } -bash-4.2$ clang -O2 -target bpf -g -mllvm -debug-only=btf -c test.c Type Table: [1] FUNC name_off=1 info=0x0c000001 size/type=2 param_type=3 [2] INT name_off=12 info=0x01000000 size/type=4 desc=0x01000020 [3] PTR name_off=0 info=0x02000000 size/type=4 [4] STRUCT name_off=16 info=0x04000002 size/type=8 name_off=18 type=2 bit_offset=0 name_off=20 type=5 bit_offset=32 [5] INT name_off=22 info=0x01000000 size/type=1 desc=0x02000008 String Table: 0 : 1 : test 6 : .text 12 : int 16 : A 18 : a 20 : b 22 : char 27 : test.c 34 : int test(struct A *t) { 58 : return t->a; FuncInfo Table: sec_name_off=6 insn_offset= type_id=1 LineInfo Table: sec_name_off=6 insn_offset= file_name_off=27 line_off=34 line_num=6 column_num=0 insn_offset= file_name_off=27 line_off=58 line_num=7 column_num=3 -bash-4.2$ readelf -S test.o ...... [12] .BTF PROGBITS 0000000000000000 0000028d 00000000000000c1 0000000000000000 0 0 1 [13] .BTF.ext PROGBITS 0000000000000000 0000034e 0000000000000050 0000000000000000 0 0 1 [14] .rel.BTF.ext REL 0000000000000000 00000648 0000000000000030 0000000000000010 16 13 8 ...... -bash-4.2$ The latest linux kernel ([6]) can already support .BTF with type information. The [7] has the reference implementation in linux kernel side to support .BTF.ext func_info. The .BTF.ext line_info support is not implemented yet. If you have difficulty accessing [6], you can manually do the following to access the code: git clone https://github.com/yonghong-song/bpf-next-linux.git cd bpf-next-linux git checkout btf The change will push to linux kernel soon once this patch is landed. References: [1]. https://www.kernel.org/doc/Documentation/networking/filter.txt [2]. https://lwn.net/Articles/750695/ [3]. https://github.com/iovisor/bcc [4]. https://github.com/torvalds/linux/tree/master/tools/lib/bpf [5]. https://github.com/torvalds/linux/tree/master/tools/bpf/bpftool [6]. 
https://github.com/torvalds/linux [7]. https://github.com/yonghong-song/bpf-next-linux/tree/btf Signed-off-by: Song Liu Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Differential Revision: https://reviews.llvm.org/D52950 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344366 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCBTFContext.h | 364 +++++++++++++++++++ include/llvm/MC/MCContext.h | 7 + include/llvm/MC/MCObjectFileInfo.h | 8 + include/llvm/MC/MCObjectStreamer.h | 1 + lib/CodeGen/AsmPrinter/CMakeLists.txt | 1 + lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 501 ++++++++++++++++++++++++++ lib/CodeGen/AsmPrinter/Dwarf2BTF.h | 134 +++++++ lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 10 + lib/CodeGen/AsmPrinter/DwarfDebug.h | 3 + lib/CodeGen/AsmPrinter/DwarfFile.cpp | 10 + lib/CodeGen/AsmPrinter/DwarfFile.h | 3 + lib/MC/CMakeLists.txt | 2 + lib/MC/MCBTFContext.cpp | 235 ++++++++++++ lib/MC/MCContext.cpp | 11 +- lib/MC/MCDwarf2BTF.cpp | 99 +++++ lib/MC/MCDwarf2BTF.h | 29 ++ lib/MC/MCObjectFileInfo.cpp | 3 + lib/MC/MCObjectStreamer.cpp | 34 ++ 18 files changed, 1454 insertions(+), 1 deletion(-) create mode 100644 include/llvm/MC/MCBTFContext.h create mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp create mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.h create mode 100644 lib/MC/MCBTFContext.cpp create mode 100644 lib/MC/MCDwarf2BTF.cpp create mode 100644 lib/MC/MCDwarf2BTF.h diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h new file mode 100644 index 00000000000..f180a69340b --- /dev/null +++ b/include/llvm/MC/MCBTFContext.h @@ -0,0 +1,364 @@ +//===- MCBTFContext.h ---------------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// This header file contains two parts. 
The first part is the BTF ELF +// specification in C format, and the second part is the various +// C++ classes to manipulate the data structure in order to generate +// the BTF related ELF sections. +//===----------------------------------------------------------------------===// +#ifndef LLVM_MC_MCBTFCONTEXT_H +#define LLVM_MC_MCBTFCONTEXT_H + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_VERSION 1 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x0000ffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x0000ffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-31: unused + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. 
+ */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) + +#define BTF_KIND_UNKN 0 /* Unknown */ +#define BTF_KIND_INT 1 /* Integer */ +#define BTF_KIND_PTR 2 /* Pointer */ +#define BTF_KIND_ARRAY 3 /* Array */ +#define BTF_KIND_STRUCT 4 /* Struct */ +#define BTF_KIND_UNION 5 /* Union */ +#define BTF_KIND_ENUM 6 /* Enumeration */ +#define BTF_KIND_FWD 7 /* Forward */ +#define BTF_KIND_TYPEDEF 8 /* Typedef */ +#define BTF_KIND_VOLATILE 9 /* Volatile */ +#define BTF_KIND_CONST 10 /* Const */ +#define BTF_KIND_RESTRICT 11 /* Restrict */ +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Prototype */ +#define BTF_KIND_MAX 13 +#define NR_BTF_KINDS 14 + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. + */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum { + __u32 name_off; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name_off; + __u32 type; + __u32 offset; /* offset in bits */ +}; + +/* .BTF.ext section contains func_info and line_info. 
+ */ +struct btf_ext_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + __u32 func_info_off; + __u32 func_info_len; + __u32 line_info_off; + __u32 line_info_len; +}; + +struct bpf_func_info { + __u32 insn_offset; + __u32 type_id; +}; + +struct btf_sec_func_info { + __u32 sec_name_off; + __u32 num_func_info; +}; + +struct bpf_line_info { + __u32 insn_offset; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */ +}; + +struct btf_sec_line_info { + __u32 sec_name_off; + __u32 num_line_info; +}; + +namespace llvm { + +const char *const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", + [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", +}; + +#include "llvm/ADT/SmallVector.h" +#include + +class MCBTFContext; +class MCObjectStreamer; + +// This is base class of all BTF KIND. 
It is also used directly +// by the reference kinds: +// BTF_KIND_CONST, BTF_KIND_PTR, BTF_KIND_VOLATILE, +// BTF_KIND_TYPEDEF, BTF_KIND_RESTRICT, and BTF_KIND_FWD +class BTFTypeEntry { +protected: + size_t Id; /* type index in the BTF list, started from 1 */ + struct btf_type BTFType; + +public: + BTFTypeEntry(size_t id, struct btf_type &type) : + Id(id), BTFType(type) {} + unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); } + void setId(size_t Id) { this->Id = Id; } + size_t getId() { return Id; } + void setNameOff(unsigned NameOff) { BTFType.name_off = NameOff; } + + unsigned getTypeIndex() { return BTFType.type; } + unsigned getNameOff() { return BTFType.name_off; } + virtual size_t getSize() { return sizeof(struct btf_type); } + virtual void print(raw_ostream &s, MCBTFContext& BTFContext); + virtual void emitData(MCObjectStreamer *MCOS); +}; + +// BTF_KIND_INT +class BTFTypeEntryInt : public BTFTypeEntry { + unsigned IntVal; // encoding, offset, bits + +public: + BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval) : + BTFTypeEntry(id, type), IntVal(intval) {} + size_t getSize() { return BTFTypeEntry::getSize() + sizeof(unsigned); } + void print(raw_ostream &s, MCBTFContext& BTFContext); + void emitData(MCObjectStreamer *MCOS); +}; + +// BTF_KIND_ENUM +class BTFTypeEntryEnum : public BTFTypeEntry { + std::vector EnumValues; + +public: + BTFTypeEntryEnum(size_t id, struct btf_type &type, + std::vector &values) : + BTFTypeEntry(id, type), EnumValues(values) {} + size_t getSize() { + return BTFTypeEntry::getSize() + + BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum); + } + void print(raw_ostream &s, MCBTFContext& BTFContext); + void emitData(MCObjectStreamer *MCOS); +}; + +// BTF_KIND_ARRAY +class BTFTypeEntryArray : public BTFTypeEntry { + struct btf_array ArrayInfo; + +public: + BTFTypeEntryArray(size_t id, struct btf_type &type, + struct btf_array &arrayinfo) : + BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {} + size_t getSize() 
{ + return BTFTypeEntry::getSize() + sizeof(struct btf_array); + } + void print(raw_ostream &s, MCBTFContext& BTFContext); + void emitData(MCObjectStreamer *MCOS); +}; + +// BTF_KIND_STRUCT and BTF_KIND_UNION +class BTFTypeEntryStruct : public BTFTypeEntry { + std::vector Members; + +public: + BTFTypeEntryStruct(size_t id, struct btf_type &type, + std::vector &members) : + BTFTypeEntry(id, type), Members(members) {} + size_t getSize() { + return BTFTypeEntry::getSize() + + BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member); + } + void print(raw_ostream &s, MCBTFContext& BTFContext); + void emitData(MCObjectStreamer *MCOS); +}; + +// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO +class BTFTypeEntryFunc : public BTFTypeEntry { + std::vector Parameters; + +public: + BTFTypeEntryFunc(size_t id, struct btf_type &type, + std::vector ¶ms) : + BTFTypeEntry(id, type), Parameters(params) {} + size_t getSize() { + return BTFTypeEntry::getSize() + + BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned); + } + void print(raw_ostream &s, MCBTFContext& BTFContext); + void emitData(MCObjectStreamer *MCOS); +}; + +class BTFStringTable { + size_t Size; // total size in bytes + std::map OffsetToIdMap; + std::vector Table; + + public: + BTFStringTable() : Size(0) {} + size_t getSize() { return Size; } + std::vector &getTable() { return Table; } + size_t addString(std::string S) { + // check whether the string already exists + for (auto &OffsetM : OffsetToIdMap) { + if (Table[OffsetM.second] == S) + return OffsetM.first; + } + // not find, add to the string table + size_t Offset = Size; + OffsetToIdMap[Offset] = Table.size(); + Table.push_back(S); + Size += S.size() + 1; + return Offset; + } + std::string &getStringAtOffset(size_t Offset) { + return Table[OffsetToIdMap[Offset]]; + } + void showTable(raw_ostream &OS) { + for (auto OffsetM : OffsetToIdMap) + OS << OffsetM.first << " : " << Table[OffsetM.second] + << "\n"; + } +}; + +struct BTFFuncInfo { + const MCSymbol *Label; + unsigned int 
TypeId; +}; + +struct BTFLineInfo { + MCSymbol *Label; + unsigned int FileNameOff; + unsigned int LineOff; + unsigned int LineNum; + unsigned int ColumnNum; +}; + +class MCBTFContext { + std::vector> TypeEntries; + BTFStringTable StringTable; + std::map> FuncInfoTable; + std::map> LineInfoTable; + + friend class BTFTypeEntry; + friend class BTFTypeEntryInt; + friend class BTFTypeEntryEnum; + friend class BTFTypeEntryArray; + friend class BTFTypeEntryStruct; + friend class BTFTypeEntryFunc; + +public: + void dump(raw_ostream& OS); + void emitAll(MCObjectStreamer *MCOS); + void emitCommonHeader(MCObjectStreamer *MCOS); + void emitBTFSection(MCObjectStreamer *MCOS); + void emitBTFExtSection(MCObjectStreamer *MCOS); + + size_t addString(std::string S) { + return StringTable.addString(S); + } + void addTypeEntry(std::unique_ptr Entry); + void addFuncInfo(unsigned SecNameOff, BTFFuncInfo Info) { + FuncInfoTable[SecNameOff].push_back(Info); + } + void addLineInfo(unsigned SecNameOff, BTFLineInfo Info) { + LineInfoTable[SecNameOff].push_back(Info); + } +}; + +} +#endif diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index 3b8ac8b79e2..d5c49408c68 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -56,6 +56,7 @@ namespace llvm { class MCSymbolWasm; class SMLoc; class SourceMgr; + class MCBTFContext; /// Context object for machine code objects. This class owns all of the /// sections that it creates. @@ -278,6 +279,9 @@ namespace llvm { /// Map of currently defined macros. 
StringMap MacroMap; + /// for BTF debug information + std::unique_ptr BTFCtx; + public: explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI, const MCObjectFileInfo *MOFI, @@ -286,6 +290,9 @@ namespace llvm { MCContext &operator=(const MCContext &) = delete; ~MCContext(); + void setBTFContext(std::unique_ptr Ctx); + std::unique_ptr &getBTFContext() { return BTFCtx; } + const SourceMgr *getSourceManager() const { return SrcMgr; } void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; } diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 8cf9e1cc55a..1dda7b0712f 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -207,6 +207,10 @@ protected: MCSection *SXDataSection; MCSection *GFIDsSection; + // BTF specific sections. + MCSection *BTFSection; + MCSection *BTFExtSection; + public: void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx, bool LargeCodeModel = false); @@ -372,6 +376,10 @@ public: return EHFrameSection; } + // BTF specific sections. 
+ MCSection *getBTFSection() const { return BTFSection; } + MCSection *getBTFExtSection() const { return BTFExtSection; } + enum Environment { IsMachO, IsELF, IsCOFF, IsWasm }; Environment getObjectFileType() const { return Env; } diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index c9e577b7e29..9d15086ac63 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -138,6 +138,7 @@ public: unsigned PointerSize); void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, const MCSymbol *Label); + void EmitBTFAdvanceLineAddr(const MCSymbol *Label, unsigned Size); void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, unsigned Column, bool PrologueEnd, bool IsStmt, StringRef FileName, SMLoc Loc) override; diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index 6cba4a0d4b8..14c895a9c82 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -17,6 +17,7 @@ add_llvm_library(LLVMAsmPrinter DwarfFile.cpp DwarfStringPool.cpp DwarfUnit.cpp + Dwarf2BTF.cpp EHStreamer.cpp ErlangGCPrinter.cpp OcamlGCPrinter.cpp diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp new file mode 100644 index 00000000000..20eab4d1fb8 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp @@ -0,0 +1,501 @@ +//===- Dwarf2BTF.cpp ------------------------------------------ *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "DwarfUnit.h" +#include "Dwarf2BTF.h" +#include "llvm/MC/MCBTFContext.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" + +namespace llvm { + +unsigned char Die2BTFEntry::getDieKind(const DIE & Die) { + auto Tag = Die.getTag(); + + switch (Tag) { + case dwarf::DW_TAG_base_type: + if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING) + return BTF_KIND_UNKN; + return BTF_KIND_INT; + case dwarf::DW_TAG_const_type: + return BTF_KIND_CONST; + case dwarf::DW_TAG_pointer_type: + return BTF_KIND_PTR; + case dwarf::DW_TAG_restrict_type: + return BTF_KIND_RESTRICT; + case dwarf::DW_TAG_volatile_type: + return BTF_KIND_VOLATILE; + case dwarf::DW_TAG_typedef: + return BTF_KIND_TYPEDEF; + case dwarf::DW_TAG_structure_type: + case dwarf::DW_TAG_class_type: + if (Die.findAttribute(dwarf::DW_AT_declaration).getType() + != DIEValue::isNone) + return BTF_KIND_FWD; + else + return BTF_KIND_STRUCT; + case dwarf::DW_TAG_union_type: + if (Die.findAttribute(dwarf::DW_AT_declaration).getType() + != DIEValue::isNone) + return BTF_KIND_FWD; + else + return BTF_KIND_UNION; + case dwarf::DW_TAG_enumeration_type: + return BTF_KIND_ENUM; + case dwarf::DW_TAG_array_type: + return BTF_KIND_ARRAY; + case dwarf::DW_TAG_subprogram: + return BTF_KIND_FUNC; + case dwarf::DW_TAG_subroutine_type: + return BTF_KIND_FUNC_PROTO; + default: + break; + } + + return BTF_KIND_UNKN; +} + +std::unique_ptr Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) { + unsigned char Kind = getDieKind(Die); + + switch (Kind) { + case BTF_KIND_INT: + return make_unique(Die); + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_FWD: + return make_unique(Die); + case BTF_KIND_ARRAY: + return make_unique(Die); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return make_unique(Die); + case BTF_KIND_ENUM: + return make_unique(Die); 
+ case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + return make_unique(Die); + default: + break; + } + return nullptr; +} + +bool Die2BTFEntry::shouldSkipDie(const DIE &Die) { + auto Tag = Die.getTag(); + + switch (Tag) { + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_pointer_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_volatile_type: + { + auto TypeV = Die.findAttribute(dwarf::DW_AT_type); + if (TypeV.getType() == DIEValue::isNone) + return false; + auto &TypeDie = TypeV.getDIEEntry().getEntry(); + return Die2BTFEntry::shouldSkipDie(TypeDie); + } + default: + return getDieKind(Die) == BTF_KIND_UNKN; + } + return true; +} +unsigned char Die2BTFEntry::getBaseTypeEncoding(const DIE &Die) { + auto V = Die.findAttribute(dwarf::DW_AT_encoding); + + if (V.getType() != DIEValue::isInteger) + return BTF_INVALID_ENCODING; + + switch (V.getDIEInteger().getValue()) { + case dwarf::DW_ATE_boolean: + return BTF_INT_BOOL; + case dwarf::DW_ATE_signed: + return BTF_INT_SIGNED; + case dwarf::DW_ATE_signed_char: + return BTF_INT_CHAR; + case dwarf::DW_ATE_unsigned: + return 0; + case dwarf::DW_ATE_unsigned_char: + return BTF_INT_CHAR; + case dwarf::DW_ATE_imaginary_float: + case dwarf::DW_ATE_packed_decimal: + case dwarf::DW_ATE_numeric_string: + case dwarf::DW_ATE_edited: + case dwarf::DW_ATE_signed_fixed: + case dwarf::DW_ATE_address: + case dwarf::DW_ATE_complex_float: + case dwarf::DW_ATE_float: + default: + break; + } + return BTF_INVALID_ENCODING; +} + +Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) { + unsigned char Kind = getDieKind(Die); + + switch (Kind) { + case BTF_KIND_CONST: + case BTF_KIND_FWD: + case BTF_KIND_PTR: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + break; + default: + assert("Invalid Die passed into BTFTypeEntry()"); + break; + } + + BTFType.info = (Kind & 0xf) << 24; +} + +void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) { + auto TypeV = 
Die.findAttribute(dwarf::DW_AT_type); + if (TypeV.getType() == DIEValue::isNone) { + BTFType.type = 0; + } else { + auto &TypeDie = TypeV.getDIEEntry().getEntry(); + auto Type = Dwarf2BTF.getTypeIndex(TypeDie); + BTFType.type = Type; + } + + unsigned char Kind = getDieKind(Die); + if (Kind != BTF_KIND_FWD) { + BTFType.name_off = 0; + } else { + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + auto Str = NameV.getDIEString().getString(); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + } + + auto typeEntry = make_unique(Id, BTFType); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); +} + +Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { + unsigned char Kind = getDieKind(Die); + + switch (Kind) { + case BTF_KIND_INT: + break; + default: + assert(false && "Invalid Die passed into BTFTypeEntryInt()"); + break; + } + + // handle BTF_INT_ENCODING in IntVal + auto Encoding = Die2BTFEntry::getBaseTypeEncoding(Die); + assert((Encoding != BTF_INVALID_ENCODING) && + "Invalid Die passed to BTFTypeEntryInt()"); + __u32 IntVal = (Encoding & 0xf) << 24; + + // handle BTF_INT_OFFSET in IntVal + auto V = Die.findAttribute(dwarf::DW_AT_bit_offset); + if (V.getType() == DIEValue::isInteger) + IntVal |= (V.getDIEInteger().getValue() & 0xff) << 16; + + // get btf_type.size + V = Die.findAttribute(dwarf::DW_AT_byte_size); + __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; + +// handle BTF_INT_BITS in IntVal + V = Die.findAttribute(dwarf::DW_AT_bit_size); + if (V.getType() == DIEValue::isInteger) + IntVal |= V.getDIEInteger().getValue() & 0xff; + else + IntVal |= (Size << 3) & 0xff; + + BTFType.info = BTF_KIND_INT << 24; + BTFType.size = Size; + this->IntVal = IntVal; +} + +void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) { + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + auto TypeV = Die.findAttribute(dwarf::DW_AT_type); + auto Str = NameV.getDIEString().getString(); + + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + + auto typeEntry = 
make_unique(Id, BTFType, IntVal); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); +} + +Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) { + // get btf_type.size + auto V = Die.findAttribute(dwarf::DW_AT_byte_size); + __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; + + int Vlen = 0; + for (auto &ChildDie : Die.children()) + if (ChildDie.getTag() == dwarf::DW_TAG_enumerator) + Vlen++; + + BTFType.info = (BTF_KIND_ENUM << 24) | (Vlen & BTF_MAX_VLEN); + BTFType.type = Size; +} + +void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) { + auto TypeV = Die.findAttribute(dwarf::DW_AT_type); + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + + if (NameV.getType() != DIEValue::isNone) { + auto Str = NameV.getDIEString().getString(); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + } else + BTFType.name_off = 0; + + for (auto &ChildDie : Die.children()) { + struct btf_enum BTFEnum; + auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name); + auto Str = ChildNameV.getDIEString().getString(); + + BTFEnum.name_off = Dwarf2BTF.addBTFString(Str); + auto ChildValueV = ChildDie.findAttribute(dwarf::DW_AT_const_value); + BTFEnum.val = (__s32)(ChildValueV.getDIEInteger().getValue()); + + EnumValues.push_back(BTFEnum); + } + + auto typeEntry = make_unique(Id, BTFType, EnumValues); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); +} + +Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) : + Die2BTFEntry(Die) { + BTFType.info = (BTF_KIND_ARRAY << 24); + BTFType.size = 0; +} + +void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) { + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + + std::string Str; + if (NameV.getType() != DIEValue::isNone) + Str = NameV.getDIEString().getString(); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + + auto &ArrayTypeDie = Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry(); + ArrayInfo.type = Dwarf2BTF.getTypeIndex(ArrayTypeDie); + + // The number of elements should 
count all subranges + unsigned Nelems = 1; + bool IsFirstSubrange = true; + for (auto &ChildDie : Die.children()) { + if (ChildDie.getTag() == dwarf::DW_TAG_subrange_type) { + if (IsFirstSubrange) { + auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type); + auto &TypeDie = TypeV.getDIEEntry().getEntry(); + ArrayInfo.index_type = Dwarf2BTF.getTypeIndex(TypeDie); + IsFirstSubrange = false; + } + auto CountV = ChildDie.findAttribute(dwarf::DW_AT_count); + if (CountV.getType() == DIEValue::isNone) { + // array like a[] which essentially a pointer + Nelems = 0; + break; + } + Nelems *= (__u32)(CountV.getDIEInteger().getValue()); + } + } + ArrayInfo.nelems = Nelems; + + auto TypeEntry = make_unique(Id, BTFType, ArrayInfo); + Dwarf2BTF.addBTFTypeEntry(std::move(TypeEntry)); +} + +Die2BTFEntryStruct::Die2BTFEntryStruct(const DIE &Die) : Die2BTFEntry(Die) { + // get btf_type.size + auto V = Die.findAttribute(dwarf::DW_AT_byte_size); + __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; + auto Kind = Die2BTFEntry::getDieKind(Die); + + int Vlen = 0; + for (auto &ChildDie : Die.children()) + if (ChildDie.getTag() == dwarf::DW_TAG_member) + Vlen++; + + BTFType.size = Size; + BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN); +} + +void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) { + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + + if (NameV.getType() != DIEValue::isNone) { + auto Str = NameV.getDIEString().getString(); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + } else + BTFType.name_off = 0; + + + for (auto &ChildDie : Die.children()) { + if (ChildDie.getTag() != dwarf::DW_TAG_member) + continue; + + struct btf_member BTFMember; + auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name); + + if (ChildNameV.getType() != DIEValue::isNone) { + auto Str = ChildNameV.getDIEString().getString(); + BTFMember.name_off = Dwarf2BTF.addBTFString(Str); + } else + BTFMember.name_off = 0; + + auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type); + 
auto &TypeDie = TypeV.getDIEEntry().getEntry(); + BTFMember.type = Dwarf2BTF.getTypeIndex(TypeDie); + + auto MemLocV = ChildDie.findAttribute(dwarf::DW_AT_data_member_location); + unsigned MemLoc = MemLocV.getDIEInteger().getValue() * 8; + + auto ByteSizeV = ChildDie.findAttribute(dwarf::DW_AT_byte_size); + if (ByteSizeV.getType() != DIEValue::isNone) { + unsigned ByteSize = ByteSizeV.getDIEInteger().getValue(); + auto BitOffsetV = ChildDie.findAttribute(dwarf::DW_AT_bit_offset); + unsigned BitOffset = BitOffsetV.getDIEInteger().getValue(); + auto BitSizeV = ChildDie.findAttribute(dwarf::DW_AT_bit_size); + unsigned BitSize = BitSizeV.getDIEInteger().getValue(); + if (Dwarf2BTF.isLittleEndian()) + MemLoc += ByteSize * 8 - BitSize - BitOffset; + else + MemLoc += BitOffset; + } + BTFMember.offset = MemLoc; + + Members.push_back(BTFMember); + } + + auto typeEntry = make_unique(Id, BTFType, Members); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); +} + +Die2BTFEntryFunc::Die2BTFEntryFunc(const DIE &Die) : Die2BTFEntry(Die) { + auto Kind = Die2BTFEntry::getDieKind(Die); + + int Vlen = 0; + for (auto &ChildDie : Die.children()) + if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter) + Vlen++; + + BTFType.size = 0; + BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN); +} + +void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) { + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + if (NameV.getType() == DIEValue::isNone) { + BTFType.name_off = 0; + } else { + auto Str = NameV.getDIEString().getString(); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + } + + auto RetTypeV = Die.findAttribute(dwarf::DW_AT_type); + if (RetTypeV.getType() != DIEValue::isNone) { + auto &TypeDie = RetTypeV.getDIEEntry().getEntry(); + BTFType.type = Dwarf2BTF.getTypeIndex(TypeDie); + } else { + BTFType.type = 0; + } + + for (auto &ChildDie : Die.children()) { + if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter) { + auto TypeV = 
ChildDie.findAttribute(dwarf::DW_AT_abstract_origin); + if (TypeV.getType() != DIEValue::isNone) { + auto &AbsOriginDie = TypeV.getDIEEntry().getEntry(); + assert(AbsOriginDie.getTag() == dwarf::DW_TAG_formal_parameter); + TypeV = AbsOriginDie.findAttribute(dwarf::DW_AT_type); + } else { + TypeV = ChildDie.findAttribute(dwarf::DW_AT_type); + } + auto &TypeDie = TypeV.getDIEEntry().getEntry(); + Parameters.push_back(Dwarf2BTF.getTypeIndex(TypeDie)); + } else if (ChildDie.getTag() == dwarf::DW_TAG_unspecified_parameters) { + Parameters.push_back(0); + } + } + + auto typeEntry = make_unique(Id, BTFType, Parameters); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); + + if (BTF_INFO_KIND(BTFType.info) == BTF_KIND_FUNC) { + auto LowPCV = Die.findAttribute(dwarf::DW_AT_low_pc); + if (LowPCV.getType() != DIEValue::isNone) { + const MCSymbol *Label = LowPCV.getDIELabel().getValue(); + BTFFuncInfo FuncInfo; + unsigned SecNameOff; + + FuncInfo.Label = Label; + FuncInfo.TypeId = Id; + if (Label->isInSection()) { + MCSection &Section = Label->getSection(); + MCSectionELF *SectionELF = dyn_cast(&Section); + assert(SectionELF); + SecNameOff = Dwarf2BTF.addBTFString(SectionELF->getSectionName().str()); + } else { + SecNameOff = Dwarf2BTF.addBTFString(".text"); + } + Dwarf2BTF.addBTFFuncInfo(SecNameOff, FuncInfo); + } + } +} + +Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian) + : OuterCtx(Context), IsLE(IsLittleEndian) { + BTFContext = make_unique(); +} + +void Dwarf2BTF::addTypeEntry(const DIE &Die) { + for (auto &ChildDie : Die.children()) + addTypeEntry(ChildDie); + if (Die2BTFEntry::shouldSkipDie(Die)) + return; + auto Kind = Die2BTFEntry::getDieKind(Die); + if (Kind != BTF_KIND_UNKN) { + auto TypeEntry = Die2BTFEntry::dieToBTFTypeEntry(Die); + if (TypeEntry != nullptr) { + TypeEntry->setId(TypeEntries.size() + 1); + DieToIdMap[const_cast(&Die)] = TypeEntry->getId(); + TypeEntries.push_back(std::move(TypeEntry)); + } + } +} + +void 
Dwarf2BTF::addBTFTypeEntry(std::unique_ptr Entry) { + BTFContext->addTypeEntry(std::move(Entry)); +} + +void Dwarf2BTF::completeData() { + BTFContext->addString("\0"); + + for (auto &TypeEntry : TypeEntries) + TypeEntry->completeData(*this); +} + +void Dwarf2BTF::addDwarfCU(DwarfUnit *TheU) { + DIE &CuDie = TheU->getUnitDie(); + + assert((CuDie.getTag() == dwarf::DW_TAG_compile_unit) && + "Not a compile unit"); + addTypeEntry(CuDie); +} + +void Dwarf2BTF::finish() { + completeData(); + OuterCtx.setBTFContext(std::move(BTFContext)); +} + +} diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h new file mode 100644 index 00000000000..3df4dd802a7 --- /dev/null +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h @@ -0,0 +1,134 @@ +//===- Dwarf2BTF.h -------------------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H + +#include "DwarfUnit.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/DIE.h" +#include "llvm/MC/MCBTFContext.h" +#include + +namespace llvm { + +class Dwarf2BTF; +class MCBTFContext; + +#define BTF_INVALID_ENCODING 0xff + +class Die2BTFEntry { +protected: + const DIE &Die; + size_t Id; /* type index in the BTF list, started from 1 */ + struct btf_type BTFType; + +public: + // Return desired BTF_KIND for the Die, return BTF_KIND_UNKN for + // invalid/unsupported Die + static unsigned char getDieKind(const DIE &Die); + + // Return proper BTF_INT_ENCODING of a basetype. + // Return BTF_INVALID_ENCODING for unsupported (float, etc.) + static unsigned char getBaseTypeEncoding(const DIE &Die); + + // Return whether this Die should be skipped. 
+ // We currently skip unsupported data type (e.g. float) + // and references to unsupported types + static bool shouldSkipDie(const DIE &Die); + + static std::unique_ptr dieToBTFTypeEntry(const DIE &Die); + + Die2BTFEntry(const DIE &Die); + void setId(size_t Id) { this->Id = Id; } + size_t getId() { return Id; } + virtual void completeData(class Dwarf2BTF &Dwarf2BTF); +}; + +// BTF_KIND_INT +class Die2BTFEntryInt : public Die2BTFEntry { + __u32 IntVal; // encoding, offset, bits + +public: + Die2BTFEntryInt(const DIE &Die); + void completeData(class Dwarf2BTF &Dwarf2BTF); +}; + +// BTF_KIND_ENUM +class Die2BTFEntryEnum : public Die2BTFEntry { + std::vector EnumValues; + +public: + Die2BTFEntryEnum(const DIE &Die); + void completeData(class Dwarf2BTF &Dwarf2BTF); +}; + +// BTF_KIND_ARRAY +class Die2BTFEntryArray : public Die2BTFEntry { + struct btf_array ArrayInfo; + +public: + Die2BTFEntryArray(const DIE &Die); + void completeData(class Dwarf2BTF &Dwarf2BTF); +}; + +// BTF_KIND_STRUCT and BTF_KIND_UNION +class Die2BTFEntryStruct : public Die2BTFEntry { + std::vector Members; + +public: + Die2BTFEntryStruct(const DIE &Die); + void completeData(class Dwarf2BTF &Dwarf2BTF); +}; + +// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO +class Die2BTFEntryFunc : public Die2BTFEntry { + std::vector<__u32> Parameters; + +public: + Die2BTFEntryFunc(const DIE &Die); + void completeData(class Dwarf2BTF &Dwarf2BTF); +}; + +class Dwarf2BTF { + std::vector> TypeEntries; + std::map DieToIdMap; + std::unique_ptr BTFContext; + MCContext &OuterCtx; + bool IsLE; + +public: + Dwarf2BTF(MCContext &Context, bool IsLittleEndian); + bool isLittleEndian() { return IsLE; } + void addDwarfCU(DwarfUnit *TheU); + void finish(); + __u32 getTypeIndex(DIE &Die) { + DIE *DiePtr = const_cast(&Die); + assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) && + "Die not added to in the BTFContext"); + return DieToIdMap[DiePtr]; + } + size_t addBTFString(std::string S) { + return BTFContext->addString(S); + } + void 
addBTFTypeEntry(std::unique_ptr Entry); + void addBTFFuncInfo(unsigned SecNameOff, BTFFuncInfo FuncInfo) { + BTFContext->addFuncInfo(SecNameOff, FuncInfo); + } + +private: + void addTypeEntry(const DIE &Die); + bool alreadyAdded(DIE &Die) { + return DieToIdMap.find(const_cast(&Die)) != DieToIdMap.end(); + } + void completeData(); +}; + +} +#endif diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 94e12658cfe..184ec4dabe9 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -971,6 +971,10 @@ void DwarfDebug::endModule() { // Emit the pubnames and pubtypes sections if requested. emitDebugPubSections(); + const Triple &TT = Asm->TM.getTargetTriple(); + if (TT.getArch() == Triple::bpfel || TT.getArch() == Triple::bpfeb) + emitBTFSection(TT.getArch() == Triple::bpfel); + // clean up. // FIXME: AbstractVariables.clear(); } @@ -2455,6 +2459,12 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) { return &SplitTypeUnitFileTable; } +void DwarfDebug::emitBTFSection(bool IsLittleEndian) { + DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; + + Holder.emitBTFSection(IsLittleEndian); +} + uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) { MD5 Hash; Hash.update(Identifier); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index fecf8056765..1350317db02 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -567,6 +567,9 @@ class DwarfDebug : public DebugHandlerBase { /// Emit the reference to the section. void emitSectionReference(const DwarfCompileUnit &CU); + // Emit the BTF sections + void emitBTFSection(bool IsLittleEndian); + protected: /// Gather pre-function debug information. 
void beginFunctionImpl(const MachineFunction *MF) override; diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp index 0ab9ea87c23..7ac16b34c4c 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "Dwarf2BTF.h" #include "DwarfFile.h" #include "DwarfCompileUnit.h" #include "DwarfDebug.h" @@ -15,6 +16,8 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DIE.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/MCBTFContext.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include #include @@ -88,6 +91,13 @@ void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection, StrPool.emit(*Asm, StrSection, OffsetSection, UseRelativeOffsets); } +void DwarfFile::emitBTFSection(bool IsLittleEndian) { + Dwarf2BTF Dwarf2BTF(Asm->OutContext, IsLittleEndian); + for (auto &TheU : CUs) + Dwarf2BTF.addDwarfCU(TheU.get()); + Dwarf2BTF.finish(); +} + bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) { auto &ScopeVars = ScopeVariables[LS]; const DILocalVariable *DV = Var->getVariable(); diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index c315f44a8d8..9aafe2613f6 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -114,6 +114,9 @@ public: void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr, bool UseRelativeOffsets = false); + // Emit all data for the BTF section + void emitBTFSection(bool IsLittleEndian); + /// Returns the string pool. 
DwarfStringPool &getStringPool() { return StrPool; } diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index ba36d99e8f7..85bf1616fd6 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -10,11 +10,13 @@ add_llvm_library(LLVMMC MCAsmMacro.cpp MCAsmStreamer.cpp MCAssembler.cpp + MCBTFContext.cpp MCCodeEmitter.cpp MCCodePadder.cpp MCCodeView.cpp MCContext.cpp MCDwarf.cpp + MCDwarf2BTF.cpp MCELFObjectTargetWriter.cpp MCELFStreamer.cpp MCExpr.cpp diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp new file mode 100644 index 00000000000..cb846ee5e51 --- /dev/null +++ b/lib/MC/MCBTFContext.cpp @@ -0,0 +1,235 @@ +//===- lib/MC/MCBTFContext.cpp - Machine Code BTF Context -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCBTFContext.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "btf" + +void MCBTFContext::addTypeEntry(std::unique_ptr Entry) { + TypeEntries.push_back(std::move(Entry)); +} + +void MCBTFContext::dump(raw_ostream &OS) { + OS << "Type Table:\n"; + for (size_t i = 0; i < TypeEntries.size(); i++) { + auto TypeEntry = TypeEntries[i].get(); + TypeEntry->print(OS, *this); + } + + OS << "\nString Table:\n"; + StringTable.showTable(OS); + + OS << "\nFuncInfo Table:\n"; + for (auto &FuncSec : FuncInfoTable) { + OS << "sec_name_off=" << FuncSec.first << "\n"; + for (auto &FuncInfo : FuncSec.second) { + OS << "\tinsn_offset= type_id=" + << FuncInfo.TypeId << "\n"; + } + } + + OS << "\nLineInfo Table:\n"; + for (auto &LineSec : LineInfoTable) { + OS << "sec_name_off=" << LineSec.first << "\n"; 
+ for (auto &LineInfo : LineSec.second) { + OS << "\tinsn_offset= file_name_off=" + << LineInfo.FileNameOff + << " line_off=" << LineInfo.LineOff + << " line_num=" << LineInfo.LineNum + << " column_num=" << LineInfo.ColumnNum + << "\n"; + } + } +} + +void MCBTFContext::emitCommonHeader(MCObjectStreamer *MCOS) { + MCOS->EmitIntValue(BTF_MAGIC, 2); + MCOS->EmitIntValue(BTF_VERSION, 1); + MCOS->EmitIntValue(0, 1); +} + +void MCBTFContext::emitBTFSection(MCObjectStreamer *MCOS) { + MCContext &context = MCOS->getContext(); + MCOS->SwitchSection(context.getObjectFileInfo()->getBTFSection()); + + // emit header + emitCommonHeader(MCOS); + MCOS->EmitIntValue(sizeof(struct btf_header), 4); + + uint32_t type_len = 0, str_len; + for (auto &TypeEntry : TypeEntries) + type_len += TypeEntry->getSize(); + str_len = StringTable.getSize(); + + MCOS->EmitIntValue(0, 4); + MCOS->EmitIntValue(type_len, 4); + MCOS->EmitIntValue(type_len, 4); + MCOS->EmitIntValue(str_len, 4); + + // emit type table + for (auto &TypeEntry: TypeEntries) + TypeEntry->emitData(MCOS); + + // emit string table + for (auto &S : StringTable.getTable()) { + for (auto C : S) + MCOS->EmitIntValue(C, 1); + MCOS->EmitIntValue('\0', 1); + } +} + +void MCBTFContext::emitBTFExtSection(MCObjectStreamer *MCOS) { + MCContext &context = MCOS->getContext(); + MCOS->SwitchSection(context.getObjectFileInfo()->getBTFExtSection()); + + // emit header + emitCommonHeader(MCOS); + MCOS->EmitIntValue(sizeof(struct btf_ext_header), 4); + + uint32_t func_len = 0, line_len = 0; + for (auto &FuncSec : FuncInfoTable) { + func_len += sizeof(struct btf_sec_func_info); + func_len += FuncSec.second.size() * sizeof(struct bpf_func_info); + } + for (auto &LineSec : LineInfoTable) { + line_len += sizeof(struct btf_sec_line_info); + line_len += LineSec.second.size() * sizeof(struct bpf_line_info); + } + + MCOS->EmitIntValue(0, 4); + MCOS->EmitIntValue(func_len, 4); + MCOS->EmitIntValue(func_len, 4); + MCOS->EmitIntValue(line_len, 4); + + // 
emit func_info table + for (const auto &FuncSec : FuncInfoTable) { + MCOS->EmitIntValue(FuncSec.first, 4); + MCOS->EmitIntValue(FuncSec.second.size(), 4); + for (const auto &FuncInfo : FuncSec.second) { + MCOS->EmitBTFAdvanceLineAddr(FuncInfo.Label, 4); + MCOS->EmitIntValue(FuncInfo.TypeId, 4); + } + } + + // emit line_info table + for (const auto &LineSec : LineInfoTable) { + MCOS->EmitIntValue(LineSec.first, 4); + MCOS->EmitIntValue(LineSec.second.size(), 4); + for (const auto &LineInfo : LineSec.second) { + MCOS->EmitBTFAdvanceLineAddr(LineInfo.Label, 4); + MCOS->EmitIntValue(LineInfo.FileNameOff, 4); + MCOS->EmitIntValue(LineInfo.LineOff, 4); + MCOS->EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4); + } + } +} + +void MCBTFContext::emitAll(MCObjectStreamer *MCOS) { + LLVM_DEBUG(dump(dbgs())); + emitBTFSection(MCOS); + emitBTFExtSection(MCOS); +} + +void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { + OS << "[" << Id << "] " + << btf_kind_str[BTF_INFO_KIND(BTFType.info)] + << " name_off=" << BTFType.name_off + << " info=" << format("0x%08lx", BTFType.info) + << " size/type=" << BTFType.size << "\n"; +} + +void BTFTypeEntry::emitData(MCObjectStreamer *MCOS) { + MCOS->EmitIntValue(BTFType.name_off, 4); + MCOS->EmitIntValue(BTFType.info, 4); + MCOS->EmitIntValue(BTFType.size, 4); +} + +void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { + BTFTypeEntry::print(OS, MCBTFContext); + OS << "\tdesc=" << format("0x%08lx", IntVal) << "\n"; +} + +void BTFTypeEntryInt::emitData(MCObjectStreamer *MCOS) { + BTFTypeEntry::emitData(MCOS); + MCOS->EmitIntValue(IntVal, 4); +} + +void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { + BTFTypeEntry::print(OS, MCBTFContext); + for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { + auto &EnumValue = EnumValues[i]; + OS << "\tname_off=" << EnumValue.name_off + << " value=" << EnumValue.val << "\n"; + } +} + +void 
BTFTypeEntryEnum::emitData(MCObjectStreamer *MCOS) { + BTFTypeEntry::emitData(MCOS); + for (auto &EnumValue : EnumValues) { + MCOS->EmitIntValue(EnumValue.name_off, 4); + MCOS->EmitIntValue(EnumValue.val, 4); + } +} + +void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { + BTFTypeEntry::print(OS, MCBTFContext); + OS << "\telem_type=" << format("0x%08lx", ArrayInfo.type) + << " index_type=" << format("0x%08lx", ArrayInfo.index_type) + << " num_element=" << ArrayInfo.nelems << "\n"; +} + +void BTFTypeEntryArray::emitData(MCObjectStreamer *MCOS) { + BTFTypeEntry::emitData(MCOS); + MCOS->EmitIntValue(ArrayInfo.type, 4); + MCOS->EmitIntValue(ArrayInfo.index_type, 4); + MCOS->EmitIntValue(ArrayInfo.nelems, 4); +} + +void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { + BTFTypeEntry::print(OS, MCBTFContext); + for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { + auto &Member = Members[i]; + OS << "\tname_off=" << Member.name_off + << " type=" << Member.type + << " bit_offset=" << Member.offset << "\n"; + } +} + +void BTFTypeEntryStruct::emitData(MCObjectStreamer *MCOS) { + BTFTypeEntry::emitData(MCOS); + for (auto &Member : Members) { + MCOS->EmitIntValue(Member.name_off, 4); + MCOS->EmitIntValue(Member.type, 4); + MCOS->EmitIntValue(Member.offset, 4); + } +} + +void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { + BTFTypeEntry::print(OS, MCBTFContext); + for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { + auto Parameter = Parameters[i]; + OS << "\tparam_type=" << Parameter << "\n"; + } +} + +void BTFTypeEntryFunc::emitData(MCObjectStreamer *MCOS) { + BTFTypeEntry::emitData(MCOS); + for (auto &Parameter: Parameters) + MCOS->EmitIntValue(Parameter, 4); +} diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index fab517075c5..18250a474b7 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -17,6 +17,7 @@ #include "llvm/BinaryFormat/COFF.h" #include 
"llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCCodeView.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" @@ -60,7 +61,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi), Symbols(Allocator), UsedNames(Allocator), CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), - AutoReset(DoAutoReset) { + AutoReset(DoAutoReset), BTFCtx(nullptr) { SecureLogFile = AsSecureLogFileName; if (SrcMgr && SrcMgr->getNumBuffers()) @@ -114,6 +115,14 @@ void MCContext::reset() { GenDwarfFileNumber = 0; HadError = false; + BTFCtx.reset(); +} + +//===----------------------------------------------------------------------===// +// BTFCtx Manipulation +//===----------------------------------------------------------------------===// +void MCContext::setBTFContext(std::unique_ptr Ctx) { + BTFCtx = std::move(Ctx); } //===----------------------------------------------------------------------===// diff --git a/lib/MC/MCDwarf2BTF.cpp b/lib/MC/MCDwarf2BTF.cpp new file mode 100644 index 00000000000..08a70e6f318 --- /dev/null +++ b/lib/MC/MCDwarf2BTF.cpp @@ -0,0 +1,99 @@ +//===- MCDwarf2BTF.cpp ---------------------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCDwarf2BTF.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCBTFContext.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/EndianStream.h" +#include + +using namespace llvm; + +void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName, + std::vector &Files) { + std::vector Content; + + std::ifstream Inputfile(FileName); + std::string Line; + Content.push_back(Line); // line 0 for empty string + while (std::getline(Inputfile, Line)) + Content.push_back(Line); + + Files.push_back(FileContent(FileName, Content)); +} + +void MCDwarf2BTF::addLines(MCObjectStreamer *MCOS, StringRef &SectionName, + std::vector &Files, + const MCLineSection::MCDwarfLineEntryCollection &LineEntries) { + MCContext &Context = MCOS->getContext(); + auto &BTFCxt = Context.getBTFContext(); + + unsigned SecNameOff = BTFCxt->addString(SectionName.str()); + for (const MCDwarfLineEntry &LineEntry : LineEntries) { + BTFLineInfo LineInfo; + unsigned FileNum = LineEntry.getFileNum(); + unsigned Line = LineEntry.getLine(); + + LineInfo.Label = LineEntry.getLabel(); + if (FileNum < Files.size()) { + LineInfo.FileNameOff = BTFCxt->addString(Files[FileNum].first); + if (Line < Files[FileNum].second.size()) + LineInfo.LineOff = BTFCxt->addString(Files[FileNum].second[Line]); + else + LineInfo.LineOff = 0; + } else { + LineInfo.FileNameOff = 0; + LineInfo.LineOff = 0; + } + LineInfo.LineNum = Line; + LineInfo.ColumnNum = LineEntry.getColumn(); + BTFCxt->addLineInfo(SecNameOff, LineInfo); + } +} + +void MCDwarf2BTF::addDwarfLineInfo(MCObjectStreamer *MCOS) { + MCContext &Context = MCOS->getContext(); + + auto &LineTables = Context.getMCDwarfLineTables(); + if (LineTables.empty()) + return; + + for (const auto 
&CUIDTablePair : LineTables) { + std::vector Dirs; + std::vector Files; + + for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs()) + Dirs.push_back(Dir); + for (auto &File : CUIDTablePair.second.getMCDwarfFiles()) { + std::string FileName; + if (File.DirIndex == 0) + FileName = File.Name; + else + FileName = Dirs[File.DirIndex - 1] + "/" + File.Name; + MCDwarf2BTF::addFiles(MCOS, FileName, Files); + } + for (const auto &LineSec: CUIDTablePair.second.getMCLineSections().getMCLineEntries()) { + MCSection *Section = LineSec.first; + const MCLineSection::MCDwarfLineEntryCollection &LineEntries = LineSec.second; + + StringRef SectionName; + if (MCSectionELF *SectionELF = dyn_cast(Section)) + SectionName = SectionELF->getSectionName(); + else + return; + MCDwarf2BTF::addLines(MCOS, SectionName, Files, LineEntries); + } + } +} diff --git a/lib/MC/MCDwarf2BTF.h b/lib/MC/MCDwarf2BTF.h new file mode 100644 index 00000000000..22d1b7741a5 --- /dev/null +++ b/lib/MC/MCDwarf2BTF.h @@ -0,0 +1,29 @@ +//===- MCDwarf2BTF.h ------------------------------------------ *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_MC_MCDWARF2BTF_H +#define LLVM_LIB_MC_MCDWARF2BTF_H + +#include "llvm/MC/MCDwarf.h" + +namespace llvm { + +using FileContent = std::pair>; + +class MCDwarf2BTF { +public: + static void addFiles(MCObjectStreamer *MCOS, std::string &FileName, + std::vector &Files); + static void addLines(MCObjectStreamer *MCOS, StringRef &SectionName, + std::vector &Files, + const MCLineSection::MCDwarfLineEntryCollection &LineEntries); + static void addDwarfLineInfo(MCObjectStreamer *MCOS); +}; + +} +#endif diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index edfccfcb9ed..bddcf459ac0 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -468,6 +468,9 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0); + + BTFSection = Ctx->getELFSection(".BTF", ELF::SHT_PROGBITS, 0); + BTFExtSection = Ctx->getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0); } void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 8c88db009bd..4f74f4101c8 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCCodeView.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" @@ -21,6 +22,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" +#include "MCDwarf2BTF.h" using namespace llvm; MCObjectStreamer::MCObjectStreamer(MCContext &Context, @@ -439,6 +441,31 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, insert(new 
MCDwarfCallFrameFragment(*AddrDelta)); } +void MCObjectStreamer::EmitBTFAdvanceLineAddr(const MCSymbol *Label, + unsigned Size) { + const MCExpr *Value = MCSymbolRefExpr::create(Label, getContext()); + MCDataFragment *DF = getOrCreateDataFragment(); + + // Avoid fixups when possible. + int64_t AbsValue; + SMLoc Loc; + + if (Value->evaluateAsAbsolute(AbsValue, getAssemblerPtr())) { + if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) { + getContext().reportError( + Loc, "value evaluated as " + Twine(AbsValue) + " is out of range."); + return; + } + EmitIntValue(AbsValue, Size); + return; + } + + DF->getFixups().push_back( + MCFixup::create(DF->getContents().size(), Value, + MCFixup::getKindForSize(Size, false), Loc)); + DF->getContents().resize(DF->getContents().size() + Size, 0); +} + void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, unsigned Column, bool PrologueEnd, bool IsStmt, @@ -688,6 +715,13 @@ void MCObjectStreamer::FinishImpl() { // Dump out the dwarf file & directory tables and line tables. MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams()); + auto &BTFCtx = getContext().getBTFContext(); + if (BTFCtx) { + MCDwarf2BTF::addDwarfLineInfo(this); + BTFCtx->emitAll(this); + BTFCtx.reset(); + } + flushPendingLabels(); getAssembler().Finish(); } -- GitLab From 2649515d3f3ba7deba9958fea4ef4b65132884a5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 12 Oct 2018 17:22:07 +0000 Subject: [PATCH 0114/1116] [Support] exit with custom return code for SIGPIPE Summary: We tell the user to file a bug report on LLVM right now, and SIGPIPE isn't LLVM's fault so our error message is wrong. Allows frontends to detect SIGPIPE from writing to closed readers. This can be seen commonly from piping into head, tee, or split. 
Fixes PR25349, rdar://problem/14285346, b/77310947 Reviewers: jfb Reviewed By: jfb Subscribers: majnemer, kristina, llvm-commits, thakis, srhines Differential Revision: https://reviews.llvm.org/D53000 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344372 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/Unix/Signals.inc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index de26695d64e..ad88d5e9690 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -47,6 +47,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include #ifdef HAVE_BACKTRACE # include BACKTRACE_HEADER // For backtrace(). #endif @@ -334,6 +335,10 @@ static RETSIGTYPE SignalHandler(int Sig) { if (auto OldInterruptFunction = InterruptFunction.exchange(nullptr)) return OldInterruptFunction(); + // Send a special return code that drivers can check for, from sysexits.h. + if (Sig == SIGPIPE) + exit(EX_IOERR); + raise(Sig); // Execute the default handler. 
return; } -- GitLab From 1172319f2e0dc803f720874c96ae9a85fd265c93 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 12 Oct 2018 17:23:25 +0000 Subject: [PATCH 0115/1116] [BPF] Some fixes after rL344366 * Move #include outside of namespaces * Add missing #include * Add out-of-line virtual destructor to BTFTypeEntry designated initializers should also be fixed git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344376 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCBTFContext.h | 32 +++++++++++++++------------- lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 2 ++ lib/CodeGen/AsmPrinter/Dwarf2BTF.h | 1 + lib/MC/MCBTFContext.cpp | 4 ++-- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h index f180a69340b..fd9edbcf7a8 100644 --- a/include/llvm/MC/MCBTFContext.h +++ b/include/llvm/MC/MCBTFContext.h @@ -13,22 +13,26 @@ #ifndef LLVM_MC_MCBTFCONTEXT_H #define LLVM_MC_MCBTFCONTEXT_H +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" #include +#include +#include -#define BTF_MAGIC 0xeB9F -#define BTF_VERSION 1 +#define BTF_MAGIC 0xeB9F +#define BTF_VERSION 1 struct btf_header { - __u16 magic; - __u8 version; - __u8 flags; - __u32 hdr_len; - - /* All offsets are in bytes relative to the end of this header */ - __u32 type_off; /* offset of type section */ - __u32 type_len; /* length of type section */ - __u32 str_off; /* offset of string section */ - __u32 str_len; /* length of string section */ + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ }; /* Max # of type identifier */ @@ -178,9 +182,6 @@ const char *const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", }; -#include 
"llvm/ADT/SmallVector.h" -#include - class MCBTFContext; class MCObjectStreamer; @@ -196,6 +197,7 @@ protected: public: BTFTypeEntry(size_t id, struct btf_type &type) : Id(id), BTFType(type) {} + virtual ~BTFTypeEntry(); unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); } void setId(size_t Id) { this->Id = Id; } size_t getId() { return Id; } diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp index 20eab4d1fb8..8b16e389963 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp @@ -15,6 +15,8 @@ namespace llvm { +Die2BTFEntry::~Die2BTFEntry() {} + unsigned char Die2BTFEntry::getDieKind(const DIE & Die) { auto Tag = Die.getTag(); diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h index 3df4dd802a7..125441d37b3 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h @@ -30,6 +30,7 @@ protected: struct btf_type BTFType; public: + virtual ~Die2BTFEntry(); // Return desired BTF_KIND for the Die, return BTF_KIND_UNKN for // invalid/unsupported Die static unsigned char getDieKind(const DIE &Die); diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp index cb846ee5e51..d1c30dd0b88 100644 --- a/lib/MC/MCBTFContext.cpp +++ b/lib/MC/MCBTFContext.cpp @@ -11,8 +11,6 @@ #include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/raw_ostream.h" #include #include #include @@ -21,6 +19,8 @@ using namespace llvm; #define DEBUG_TYPE "btf" +BTFTypeEntry::~BTFTypeEntry() {} + void MCBTFContext::addTypeEntry(std::unique_ptr Entry) { TypeEntries.push_back(std::move(Entry)); } -- GitLab From 91defc144e119483c2d4196a875bfa957ec81303 Mon Sep 17 00:00:00 2001 From: Zachary Turner Date: Fri, 12 Oct 2018 17:26:19 +0000 Subject: [PATCH 0116/1116] Better support for POSIX paths in PDBs. 
This is a resubmission of a patch which was previously reverted due to breaking several lld tests. The issues causing those failures have been fixed, so the patch is now resubmitted. ---Original Commit Message--- While it doesn't make a *ton* of sense for POSIX paths to be in PDBs, it's possible to occur in real scenarios involving cross compilation. The tools need to be able to handle this, because certain types of debugging scenarios are possible without a running process and so don't necessarily require you to be on a Windows system. These include post-mortem debugging and binary forensics (e.g. using a debugger to disassemble functions and examine symbols without running the process). There are changes in clang, LLD, and lldb in this patch. After this the cross-platform disassembly and source-list tests pass on Linux. Furthermore, the behavior of LLD can now be summarized by a much simpler rule than before: Unless you specify /pdbsourcepath and /pdbaltpath, the PDB ends up with paths that are valid within the context of the machine that the link is performed on. Differential Revision: https://reviews.llvm.org/D53149 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344377 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 4d45a103c5a..8232f076a93 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -73,6 +73,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Path.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -134,7 +135,9 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) { // If this is a Unix-style path, just use it as is. 
Don't try to canonicalize // it textually because one of the path components could be a symlink. - if (!Dir.empty() && Dir[0] == '/') { + if (Dir.startswith("/") || Filename.startswith("/")) { + if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix)) + return Filename; Filepath = Dir; if (Dir.back() != '/') Filepath += '/'; -- GitLab From a73f1cf1de29de3f3cd66589e9dc6139cf17d89e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 12 Oct 2018 17:41:12 +0000 Subject: [PATCH 0117/1116] [BPF] Don't include linux/types.h and fix style git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344381 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCBTFContext.h | 255 ++++++++++++------------ lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 280 +++++++++++++-------------- lib/CodeGen/AsmPrinter/Dwarf2BTF.h | 16 +- lib/CodeGen/AsmPrinter/DwarfFile.h | 4 +- lib/MC/MCBTFContext.cpp | 48 ++--- lib/MC/MCDwarf2BTF.cpp | 17 +- lib/MC/MCDwarf2BTF.h | 11 +- 7 files changed, 309 insertions(+), 322 deletions(-) diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h index fd9edbcf7a8..43103273acb 100644 --- a/include/llvm/MC/MCBTFContext.h +++ b/include/llvm/MC/MCBTFContext.h @@ -15,10 +15,16 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/raw_ostream.h" -#include #include #include +typedef __signed__ char __s8; +typedef unsigned char __u8; +typedef __signed__ short __s16; +typedef unsigned short __u16; +typedef __signed__ int __s32; +typedef unsigned int __u32; + #define BTF_MAGIC 0xeB9F #define BTF_VERSION 1 @@ -36,53 +42,53 @@ struct btf_header { }; /* Max # of type identifier */ -#define BTF_MAX_TYPE 0x0000ffff +#define BTF_MAX_TYPE 0x0000ffff /* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x0000ffff +#define BTF_MAX_NAME_OFFSET 0x0000ffff /* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff +#define BTF_MAX_VLEN 0xffff struct btf_type { - __u32 name_off; - /* "info" bits 
arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused - */ - __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. - * "size" tells the size of the type it is describing. - * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. - * "type" is a type_id referring to another type. - */ - union { - __u32 size; - __u32 type; - }; + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-31: unused + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) -#define BTF_INFO_VLEN(info) ((info) & 0xffff) - -#define BTF_KIND_UNKN 0 /* Unknown */ -#define BTF_KIND_INT 1 /* Integer */ -#define BTF_KIND_PTR 2 /* Pointer */ -#define BTF_KIND_ARRAY 3 /* Array */ -#define BTF_KIND_STRUCT 4 /* Struct */ -#define BTF_KIND_UNION 5 /* Union */ -#define BTF_KIND_ENUM 6 /* Enumeration */ -#define BTF_KIND_FWD 7 /* Forward */ -#define BTF_KIND_TYPEDEF 8 /* Typedef */ -#define BTF_KIND_VOLATILE 9 /* Volatile */ -#define BTF_KIND_CONST 10 /* Const */ -#define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_FUNC 12 /* Function */ -#define BTF_KIND_FUNC_PROTO 13 /* Function Prototype */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_VLEN(info) ((info)&0xffff) + +#define BTF_KIND_UNKN 0 /* Unknown */ +#define BTF_KIND_INT 1 /* Integer */ +#define BTF_KIND_PTR 2 /* Pointer */ +#define BTF_KIND_ARRAY 3 /* Array */ +#define BTF_KIND_STRUCT 4 
/* Struct */ +#define BTF_KIND_UNION 5 /* Union */ +#define BTF_KIND_ENUM 6 /* Enumeration */ +#define BTF_KIND_FWD 7 /* Forward */ +#define BTF_KIND_TYPEDEF 8 /* Typedef */ +#define BTF_KIND_VOLATILE 9 /* Volatile */ +#define BTF_KIND_CONST 10 /* Const */ +#define BTF_KIND_RESTRICT 11 /* Restrict */ +#define BTF_KIND_FUNC 12 /* Function */ +#define BTF_KIND_FUNC_PROTO 13 /* Function Prototype */ +#define BTF_KIND_MAX 13 +#define NR_BTF_KINDS 14 /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -91,29 +97,29 @@ struct btf_type { /* BTF_KIND_INT is followed by a u32 and the following * is the 32 bits arrangement: */ -#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) -#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) -#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) +#define BTF_INT_ENCODING(VAL) (((VAL)&0x0f000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) +#define BTF_INT_BITS(VAL) ((VAL)&0x000000ff) /* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED (1 << 0) -#define BTF_INT_CHAR (1 << 1) -#define BTF_INT_BOOL (1 << 2) +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) /* BTF_KIND_ENUM is followed by multiple "struct btf_enum". * The exact number of btf_enum is stored in the vlen (of the * info in "struct btf_type"). */ struct btf_enum { - __u32 name_off; - __s32 val; + __u32 name_off; + __s32 val; }; /* BTF_KIND_ARRAY is followed by one "struct btf_array" */ struct btf_array { - __u32 type; - __u32 index_type; - __u32 nelems; + __u32 type; + __u32 index_type; + __u32 nelems; }; /* BTF_KIND_STRUCT and BTF_KIND_UNION are followed @@ -122,64 +128,57 @@ struct btf_array { * "struct btf_type"). */ struct btf_member { - __u32 name_off; - __u32 type; - __u32 offset; /* offset in bits */ + __u32 name_off; + __u32 type; + __u32 offset; /* offset in bits */ }; /* .BTF.ext section contains func_info and line_info. 
*/ struct btf_ext_header { - __u16 magic; - __u8 version; - __u8 flags; - __u32 hdr_len; - - __u32 func_info_off; - __u32 func_info_len; - __u32 line_info_off; - __u32 line_info_len; + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + __u32 func_info_off; + __u32 func_info_len; + __u32 line_info_off; + __u32 line_info_len; }; struct bpf_func_info { - __u32 insn_offset; - __u32 type_id; + __u32 insn_offset; + __u32 type_id; }; struct btf_sec_func_info { - __u32 sec_name_off; - __u32 num_func_info; + __u32 sec_name_off; + __u32 num_func_info; }; struct bpf_line_info { - __u32 insn_offset; - __u32 file_name_off; - __u32 line_off; - __u32 line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */ + __u32 insn_offset; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */ }; struct btf_sec_line_info { - __u32 sec_name_off; - __u32 num_line_info; + __u32 sec_name_off; + __u32 num_line_info; }; namespace llvm { const char *const btf_kind_str[NR_BTF_KINDS] = { - [BTF_KIND_UNKN] = "UNKNOWN", - [BTF_KIND_INT] = "INT", - [BTF_KIND_PTR] = "PTR", - [BTF_KIND_ARRAY] = "ARRAY", - [BTF_KIND_STRUCT] = "STRUCT", - [BTF_KIND_UNION] = "UNION", - [BTF_KIND_ENUM] = "ENUM", - [BTF_KIND_FWD] = "FWD", - [BTF_KIND_TYPEDEF] = "TYPEDEF", - [BTF_KIND_VOLATILE] = "VOLATILE", - [BTF_KIND_CONST] = "CONST", - [BTF_KIND_RESTRICT] = "RESTRICT", - [BTF_KIND_FUNC] = "FUNC", - [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_UNKN] = "UNKNOWN", [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", + [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", }; class MCBTFContext; @@ -191,12 +190,11 @@ class MCObjectStreamer; // BTF_KIND_TYPEDEF, 
BTF_KIND_RESTRICT, and BTF_KIND_FWD class BTFTypeEntry { protected: - size_t Id; /* type index in the BTF list, started from 1 */ + size_t Id; /* type index in the BTF list, started from 1 */ struct btf_type BTFType; public: - BTFTypeEntry(size_t id, struct btf_type &type) : - Id(id), BTFType(type) {} + BTFTypeEntry(size_t id, struct btf_type &type) : Id(id), BTFType(type) {} virtual ~BTFTypeEntry(); unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); } void setId(size_t Id) { this->Id = Id; } @@ -206,19 +204,19 @@ public: unsigned getTypeIndex() { return BTFType.type; } unsigned getNameOff() { return BTFType.name_off; } virtual size_t getSize() { return sizeof(struct btf_type); } - virtual void print(raw_ostream &s, MCBTFContext& BTFContext); + virtual void print(raw_ostream &s, MCBTFContext &BTFContext); virtual void emitData(MCObjectStreamer *MCOS); }; // BTF_KIND_INT class BTFTypeEntryInt : public BTFTypeEntry { - unsigned IntVal; // encoding, offset, bits + unsigned IntVal; // encoding, offset, bits public: - BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval) : - BTFTypeEntry(id, type), IntVal(intval) {} + BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval) + : BTFTypeEntry(id, type), IntVal(intval) {} size_t getSize() { return BTFTypeEntry::getSize() + sizeof(unsigned); } - void print(raw_ostream &s, MCBTFContext& BTFContext); + void print(raw_ostream &s, MCBTFContext &BTFContext); void emitData(MCObjectStreamer *MCOS); }; @@ -228,13 +226,13 @@ class BTFTypeEntryEnum : public BTFTypeEntry { public: BTFTypeEntryEnum(size_t id, struct btf_type &type, - std::vector &values) : - BTFTypeEntry(id, type), EnumValues(values) {} + std::vector &values) + : BTFTypeEntry(id, type), EnumValues(values) {} size_t getSize() { return BTFTypeEntry::getSize() + - BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum); + BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum); } - void print(raw_ostream &s, MCBTFContext& BTFContext); + void 
print(raw_ostream &s, MCBTFContext &BTFContext); void emitData(MCObjectStreamer *MCOS); }; @@ -244,12 +242,12 @@ class BTFTypeEntryArray : public BTFTypeEntry { public: BTFTypeEntryArray(size_t id, struct btf_type &type, - struct btf_array &arrayinfo) : - BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {} + struct btf_array &arrayinfo) + : BTFTypeEntry(id, type), ArrayInfo(arrayinfo) {} size_t getSize() { - return BTFTypeEntry::getSize() + sizeof(struct btf_array); + return BTFTypeEntry::getSize() + sizeof(struct btf_array); } - void print(raw_ostream &s, MCBTFContext& BTFContext); + void print(raw_ostream &s, MCBTFContext &BTFContext); void emitData(MCObjectStreamer *MCOS); }; @@ -259,13 +257,13 @@ class BTFTypeEntryStruct : public BTFTypeEntry { public: BTFTypeEntryStruct(size_t id, struct btf_type &type, - std::vector &members) : - BTFTypeEntry(id, type), Members(members) {} + std::vector &members) + : BTFTypeEntry(id, type), Members(members) {} size_t getSize() { return BTFTypeEntry::getSize() + - BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member); + BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member); } - void print(raw_ostream &s, MCBTFContext& BTFContext); + void print(raw_ostream &s, MCBTFContext &BTFContext); void emitData(MCObjectStreamer *MCOS); }; @@ -275,22 +273,22 @@ class BTFTypeEntryFunc : public BTFTypeEntry { public: BTFTypeEntryFunc(size_t id, struct btf_type &type, - std::vector ¶ms) : - BTFTypeEntry(id, type), Parameters(params) {} + std::vector ¶ms) + : BTFTypeEntry(id, type), Parameters(params) {} size_t getSize() { return BTFTypeEntry::getSize() + - BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned); + BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned); } - void print(raw_ostream &s, MCBTFContext& BTFContext); + void print(raw_ostream &s, MCBTFContext &BTFContext); void emitData(MCObjectStreamer *MCOS); }; class BTFStringTable { - size_t Size; // total size in bytes + size_t Size; // total size in bytes std::map OffsetToIdMap; std::vector 
Table; - public: +public: BTFStringTable() : Size(0) {} size_t getSize() { return Size; } std::vector &getTable() { return Table; } @@ -312,22 +310,21 @@ class BTFStringTable { } void showTable(raw_ostream &OS) { for (auto OffsetM : OffsetToIdMap) - OS << OffsetM.first << " : " << Table[OffsetM.second] - << "\n"; + OS << OffsetM.first << " : " << Table[OffsetM.second] << "\n"; } }; -struct BTFFuncInfo { - const MCSymbol *Label; - unsigned int TypeId; +struct BTFFuncInfo { + const MCSymbol *Label; + unsigned int TypeId; }; -struct BTFLineInfo { - MCSymbol *Label; - unsigned int FileNameOff; - unsigned int LineOff; - unsigned int LineNum; - unsigned int ColumnNum; +struct BTFLineInfo { + MCSymbol *Label; + unsigned int FileNameOff; + unsigned int LineOff; + unsigned int LineNum; + unsigned int ColumnNum; }; class MCBTFContext { @@ -344,15 +341,13 @@ class MCBTFContext { friend class BTFTypeEntryFunc; public: - void dump(raw_ostream& OS); + void dump(raw_ostream &OS); void emitAll(MCObjectStreamer *MCOS); void emitCommonHeader(MCObjectStreamer *MCOS); void emitBTFSection(MCObjectStreamer *MCOS); void emitBTFExtSection(MCObjectStreamer *MCOS); - size_t addString(std::string S) { - return StringTable.addString(S); - } + size_t addString(std::string S) { return StringTable.addString(S); } void addTypeEntry(std::unique_ptr Entry); void addFuncInfo(unsigned SecNameOff, BTFFuncInfo Info) { FuncInfoTable[SecNameOff].push_back(Info); diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp index 8b16e389963..44484c2ae05 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#include "DwarfUnit.h" #include "Dwarf2BTF.h" +#include "DwarfUnit.h" #include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" @@ -17,47 +17,47 @@ namespace llvm { Die2BTFEntry::~Die2BTFEntry() {} 
-unsigned char Die2BTFEntry::getDieKind(const DIE & Die) { +unsigned char Die2BTFEntry::getDieKind(const DIE &Die) { auto Tag = Die.getTag(); switch (Tag) { - case dwarf::DW_TAG_base_type: - if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING) - return BTF_KIND_UNKN; - return BTF_KIND_INT; - case dwarf::DW_TAG_const_type: - return BTF_KIND_CONST; - case dwarf::DW_TAG_pointer_type: - return BTF_KIND_PTR; - case dwarf::DW_TAG_restrict_type: - return BTF_KIND_RESTRICT; - case dwarf::DW_TAG_volatile_type: - return BTF_KIND_VOLATILE; - case dwarf::DW_TAG_typedef: - return BTF_KIND_TYPEDEF; - case dwarf::DW_TAG_structure_type: - case dwarf::DW_TAG_class_type: - if (Die.findAttribute(dwarf::DW_AT_declaration).getType() - != DIEValue::isNone) - return BTF_KIND_FWD; - else - return BTF_KIND_STRUCT; - case dwarf::DW_TAG_union_type: - if (Die.findAttribute(dwarf::DW_AT_declaration).getType() - != DIEValue::isNone) - return BTF_KIND_FWD; - else - return BTF_KIND_UNION; - case dwarf::DW_TAG_enumeration_type: - return BTF_KIND_ENUM; - case dwarf::DW_TAG_array_type: - return BTF_KIND_ARRAY; - case dwarf::DW_TAG_subprogram: - return BTF_KIND_FUNC; - case dwarf::DW_TAG_subroutine_type: - return BTF_KIND_FUNC_PROTO; - default: - break; + case dwarf::DW_TAG_base_type: + if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING) + return BTF_KIND_UNKN; + return BTF_KIND_INT; + case dwarf::DW_TAG_const_type: + return BTF_KIND_CONST; + case dwarf::DW_TAG_pointer_type: + return BTF_KIND_PTR; + case dwarf::DW_TAG_restrict_type: + return BTF_KIND_RESTRICT; + case dwarf::DW_TAG_volatile_type: + return BTF_KIND_VOLATILE; + case dwarf::DW_TAG_typedef: + return BTF_KIND_TYPEDEF; + case dwarf::DW_TAG_structure_type: + case dwarf::DW_TAG_class_type: + if (Die.findAttribute(dwarf::DW_AT_declaration).getType() != + DIEValue::isNone) + return BTF_KIND_FWD; + else + return BTF_KIND_STRUCT; + case dwarf::DW_TAG_union_type: + if (Die.findAttribute(dwarf::DW_AT_declaration).getType() != + DIEValue::isNone) 
+ return BTF_KIND_FWD; + else + return BTF_KIND_UNION; + case dwarf::DW_TAG_enumeration_type: + return BTF_KIND_ENUM; + case dwarf::DW_TAG_array_type: + return BTF_KIND_ARRAY; + case dwarf::DW_TAG_subprogram: + return BTF_KIND_FUNC; + case dwarf::DW_TAG_subroutine_type: + return BTF_KIND_FUNC_PROTO; + default: + break; } return BTF_KIND_UNKN; @@ -67,27 +67,27 @@ std::unique_ptr Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) { unsigned char Kind = getDieKind(Die); switch (Kind) { - case BTF_KIND_INT: - return make_unique(Die); - case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: - case BTF_KIND_VOLATILE: - case BTF_KIND_CONST: - case BTF_KIND_RESTRICT: - case BTF_KIND_FWD: - return make_unique(Die); - case BTF_KIND_ARRAY: - return make_unique(Die); - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - return make_unique(Die); - case BTF_KIND_ENUM: - return make_unique(Die); - case BTF_KIND_FUNC: - case BTF_KIND_FUNC_PROTO: - return make_unique(Die); - default: - break; + case BTF_KIND_INT: + return make_unique(Die); + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_FWD: + return make_unique(Die); + case BTF_KIND_ARRAY: + return make_unique(Die); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return make_unique(Die); + case BTF_KIND_ENUM: + return make_unique(Die); + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + return make_unique(Die); + default: + break; } return nullptr; } @@ -96,20 +96,19 @@ bool Die2BTFEntry::shouldSkipDie(const DIE &Die) { auto Tag = Die.getTag(); switch (Tag) { - case dwarf::DW_TAG_const_type: - case dwarf::DW_TAG_pointer_type: - case dwarf::DW_TAG_restrict_type: - case dwarf::DW_TAG_typedef: - case dwarf::DW_TAG_volatile_type: - { - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - if (TypeV.getType() == DIEValue::isNone) - return false; - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - return Die2BTFEntry::shouldSkipDie(TypeDie); - } - default: - return 
getDieKind(Die) == BTF_KIND_UNKN; + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_pointer_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_volatile_type: { + auto TypeV = Die.findAttribute(dwarf::DW_AT_type); + if (TypeV.getType() == DIEValue::isNone) + return false; + auto &TypeDie = TypeV.getDIEEntry().getEntry(); + return Die2BTFEntry::shouldSkipDie(TypeDie); + } + default: + return getDieKind(Die) == BTF_KIND_UNKN; } return true; } @@ -120,26 +119,26 @@ unsigned char Die2BTFEntry::getBaseTypeEncoding(const DIE &Die) { return BTF_INVALID_ENCODING; switch (V.getDIEInteger().getValue()) { - case dwarf::DW_ATE_boolean: - return BTF_INT_BOOL; - case dwarf::DW_ATE_signed: - return BTF_INT_SIGNED; - case dwarf::DW_ATE_signed_char: - return BTF_INT_CHAR; - case dwarf::DW_ATE_unsigned: - return 0; - case dwarf::DW_ATE_unsigned_char: - return BTF_INT_CHAR; - case dwarf::DW_ATE_imaginary_float: - case dwarf::DW_ATE_packed_decimal: - case dwarf::DW_ATE_numeric_string: - case dwarf::DW_ATE_edited: - case dwarf::DW_ATE_signed_fixed: - case dwarf::DW_ATE_address: - case dwarf::DW_ATE_complex_float: - case dwarf::DW_ATE_float: - default: - break; + case dwarf::DW_ATE_boolean: + return BTF_INT_BOOL; + case dwarf::DW_ATE_signed: + return BTF_INT_SIGNED; + case dwarf::DW_ATE_signed_char: + return BTF_INT_CHAR; + case dwarf::DW_ATE_unsigned: + return 0; + case dwarf::DW_ATE_unsigned_char: + return BTF_INT_CHAR; + case dwarf::DW_ATE_imaginary_float: + case dwarf::DW_ATE_packed_decimal: + case dwarf::DW_ATE_numeric_string: + case dwarf::DW_ATE_edited: + case dwarf::DW_ATE_signed_fixed: + case dwarf::DW_ATE_address: + case dwarf::DW_ATE_complex_float: + case dwarf::DW_ATE_float: + default: + break; } return BTF_INVALID_ENCODING; } @@ -148,53 +147,53 @@ Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) { unsigned char Kind = getDieKind(Die); switch (Kind) { - case BTF_KIND_CONST: - case BTF_KIND_FWD: - case BTF_KIND_PTR: - case 
BTF_KIND_RESTRICT: - case BTF_KIND_TYPEDEF: - case BTF_KIND_VOLATILE: - break; - default: - assert("Invalid Die passed into BTFTypeEntry()"); - break; + case BTF_KIND_CONST: + case BTF_KIND_FWD: + case BTF_KIND_PTR: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + break; + default: + assert("Invalid Die passed into BTFTypeEntry()"); + break; } BTFType.info = (Kind & 0xf) << 24; } void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - if (TypeV.getType() == DIEValue::isNone) { - BTFType.type = 0; - } else { - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - auto Type = Dwarf2BTF.getTypeIndex(TypeDie); - BTFType.type = Type; - } + auto TypeV = Die.findAttribute(dwarf::DW_AT_type); + if (TypeV.getType() == DIEValue::isNone) { + BTFType.type = 0; + } else { + auto &TypeDie = TypeV.getDIEEntry().getEntry(); + auto Type = Dwarf2BTF.getTypeIndex(TypeDie); + BTFType.type = Type; + } - unsigned char Kind = getDieKind(Die); - if (Kind != BTF_KIND_FWD) { - BTFType.name_off = 0; - } else { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - auto Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - } + unsigned char Kind = getDieKind(Die); + if (Kind != BTF_KIND_FWD) { + BTFType.name_off = 0; + } else { + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + auto Str = NameV.getDIEString().getString(); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); + } - auto typeEntry = make_unique(Id, BTFType); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); + auto typeEntry = make_unique(Id, BTFType); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { unsigned char Kind = getDieKind(Die); switch (Kind) { - case BTF_KIND_INT: - break; - default: - assert("Invalid Die passed into BTFTypeEntryInt()"); - break; + case BTF_KIND_INT: + break; + default: + assert("Invalid Die passed into 
BTFTypeEntryInt()"); + break; } // handle BTF_INT_ENCODING in IntVal @@ -212,7 +211,7 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { V = Die.findAttribute(dwarf::DW_AT_byte_size); __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; -// handle BTF_INT_BITS in IntVal + // handle BTF_INT_BITS in IntVal V = Die.findAttribute(dwarf::DW_AT_bit_size); if (V.getType() == DIEValue::isInteger) IntVal |= V.getDIEInteger().getValue() & 0xff; @@ -225,14 +224,14 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { } void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - auto Str = NameV.getDIEString().getString(); + auto NameV = Die.findAttribute(dwarf::DW_AT_name); + auto TypeV = Die.findAttribute(dwarf::DW_AT_type); + auto Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); + BTFType.name_off = Dwarf2BTF.addBTFString(Str); - auto typeEntry = make_unique(Id, BTFType, IntVal); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); + auto typeEntry = make_unique(Id, BTFType, IntVal); + Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) { @@ -275,8 +274,7 @@ void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) { Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } -Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) : - Die2BTFEntry(Die) { +Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) : Die2BTFEntry(Die) { BTFType.info = (BTF_KIND_ARRAY << 24); BTFType.size = 0; } @@ -289,7 +287,8 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) { Str = NameV.getDIEString().getString(); BTFType.name_off = Dwarf2BTF.addBTFString(Str); - auto &ArrayTypeDie = Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry(); + auto &ArrayTypeDie = + 
Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry(); ArrayInfo.type = Dwarf2BTF.getTypeIndex(ArrayTypeDie); // The number of elements should count all subranges @@ -342,7 +341,6 @@ void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) { } else BTFType.name_off = 0; - for (auto &ChildDie : Die.children()) { if (ChildDie.getTag() != dwarf::DW_TAG_member) continue; @@ -456,7 +454,7 @@ void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) { } Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian) - : OuterCtx(Context), IsLE(IsLittleEndian) { + : OuterCtx(Context), IsLE(IsLittleEndian) { BTFContext = make_unique(); } @@ -470,7 +468,7 @@ void Dwarf2BTF::addTypeEntry(const DIE &Die) { auto TypeEntry = Die2BTFEntry::dieToBTFTypeEntry(Die); if (TypeEntry != nullptr) { TypeEntry->setId(TypeEntries.size() + 1); - DieToIdMap[const_cast(&Die)] = TypeEntry->getId(); + DieToIdMap[const_cast(&Die)] = TypeEntry->getId(); TypeEntries.push_back(std::move(TypeEntry)); } } @@ -500,4 +498,4 @@ void Dwarf2BTF::finish() { OuterCtx.setBTFContext(std::move(BTFContext)); } -} +} // namespace llvm diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h index 125441d37b3..a472d68ed7e 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h @@ -26,7 +26,7 @@ class MCBTFContext; class Die2BTFEntry { protected: const DIE &Die; - size_t Id; /* type index in the BTF list, started from 1 */ + size_t Id; /* type index in the BTF list, started from 1 */ struct btf_type BTFType; public: @@ -54,7 +54,7 @@ public: // BTF_KIND_INT class Die2BTFEntryInt : public Die2BTFEntry { - __u32 IntVal; // encoding, offset, bits + __u32 IntVal; // encoding, offset, bits public: Die2BTFEntryInt(const DIE &Die); @@ -99,7 +99,7 @@ public: class Dwarf2BTF { std::vector> TypeEntries; - std::map DieToIdMap; + std::map DieToIdMap; std::unique_ptr BTFContext; MCContext &OuterCtx; bool IsLE; @@ -110,14 +110,12 @@ public: void 
addDwarfCU(DwarfUnit *TheU); void finish(); __u32 getTypeIndex(DIE &Die) { - DIE *DiePtr = const_cast(&Die); + DIE *DiePtr = const_cast(&Die); assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) && "Die not added to in the BTFContext"); return DieToIdMap[DiePtr]; } - size_t addBTFString(std::string S) { - return BTFContext->addString(S); - } + size_t addBTFString(std::string S) { return BTFContext->addString(S); } void addBTFTypeEntry(std::unique_ptr Entry); void addBTFFuncInfo(unsigned SecNameOff, BTFFuncInfo FuncInfo) { BTFContext->addFuncInfo(SecNameOff, FuncInfo); @@ -126,10 +124,10 @@ public: private: void addTypeEntry(const DIE &Die); bool alreadyAdded(DIE &Die) { - return DieToIdMap.find(const_cast(&Die)) != DieToIdMap.end(); + return DieToIdMap.find(const_cast(&Die)) != DieToIdMap.end(); } void completeData(); }; -} +} // namespace llvm #endif diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index 9aafe2613f6..114f98f725d 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -137,9 +137,7 @@ public: return ScopeVariables; } - DenseMap &getScopeLabels() { - return ScopeLabels; - } + DenseMap &getScopeLabels() { return ScopeLabels; } DenseMap &getAbstractSPDies() { return AbstractSPDies; diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp index d1c30dd0b88..cb121c41552 100644 --- a/lib/MC/MCBTFContext.cpp +++ b/lib/MC/MCBTFContext.cpp @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCBTFContext.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCObjectStreamer.h" #include @@ -22,7 +22,7 @@ using namespace llvm; BTFTypeEntry::~BTFTypeEntry() {} void MCBTFContext::addTypeEntry(std::unique_ptr Entry) { - TypeEntries.push_back(std::move(Entry)); + TypeEntries.push_back(std::move(Entry)); } void MCBTFContext::dump(raw_ostream &OS) { @@ 
-39,8 +39,7 @@ void MCBTFContext::dump(raw_ostream &OS) { for (auto &FuncSec : FuncInfoTable) { OS << "sec_name_off=" << FuncSec.first << "\n"; for (auto &FuncInfo : FuncSec.second) { - OS << "\tinsn_offset= type_id=" - << FuncInfo.TypeId << "\n"; + OS << "\tinsn_offset= type_id=" << FuncInfo.TypeId << "\n"; } } @@ -48,12 +47,9 @@ void MCBTFContext::dump(raw_ostream &OS) { for (auto &LineSec : LineInfoTable) { OS << "sec_name_off=" << LineSec.first << "\n"; for (auto &LineInfo : LineSec.second) { - OS << "\tinsn_offset= file_name_off=" - << LineInfo.FileNameOff - << " line_off=" << LineInfo.LineOff - << " line_num=" << LineInfo.LineNum - << " column_num=" << LineInfo.ColumnNum - << "\n"; + OS << "\tinsn_offset= file_name_off=" << LineInfo.FileNameOff + << " line_off=" << LineInfo.LineOff << " line_num=" << LineInfo.LineNum + << " column_num=" << LineInfo.ColumnNum << "\n"; } } } @@ -83,7 +79,7 @@ void MCBTFContext::emitBTFSection(MCObjectStreamer *MCOS) { MCOS->EmitIntValue(str_len, 4); // emit type table - for (auto &TypeEntry: TypeEntries) + for (auto &TypeEntry : TypeEntries) TypeEntry->emitData(MCOS); // emit string table @@ -146,9 +142,8 @@ void MCBTFContext::emitAll(MCObjectStreamer *MCOS) { emitBTFExtSection(MCOS); } -void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { - OS << "[" << Id << "] " - << btf_kind_str[BTF_INFO_KIND(BTFType.info)] +void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { + OS << "[" << Id << "] " << btf_kind_str[BTF_INFO_KIND(BTFType.info)] << " name_off=" << BTFType.name_off << " info=" << format("0x%08lx", BTFType.info) << " size/type=" << BTFType.size << "\n"; @@ -160,7 +155,7 @@ void BTFTypeEntry::emitData(MCObjectStreamer *MCOS) { MCOS->EmitIntValue(BTFType.size, 4); } -void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { +void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { BTFTypeEntry::print(OS, MCBTFContext); OS << "\tdesc=" << format("0x%08lx", 
IntVal) << "\n"; } @@ -170,12 +165,12 @@ void BTFTypeEntryInt::emitData(MCObjectStreamer *MCOS) { MCOS->EmitIntValue(IntVal, 4); } -void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { - BTFTypeEntry::print(OS, MCBTFContext); +void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { + BTFTypeEntry::print(OS, MCBTFContext); for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { auto &EnumValue = EnumValues[i]; - OS << "\tname_off=" << EnumValue.name_off - << " value=" << EnumValue.val << "\n"; + OS << "\tname_off=" << EnumValue.name_off << " value=" << EnumValue.val + << "\n"; } } @@ -187,7 +182,7 @@ void BTFTypeEntryEnum::emitData(MCObjectStreamer *MCOS) { } } -void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { +void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { BTFTypeEntry::print(OS, MCBTFContext); OS << "\telem_type=" << format("0x%08lx", ArrayInfo.type) << " index_type=" << format("0x%08lx", ArrayInfo.index_type) @@ -201,12 +196,11 @@ void BTFTypeEntryArray::emitData(MCObjectStreamer *MCOS) { MCOS->EmitIntValue(ArrayInfo.nelems, 4); } -void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { +void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { BTFTypeEntry::print(OS, MCBTFContext); - for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { + for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { auto &Member = Members[i]; - OS << "\tname_off=" << Member.name_off - << " type=" << Member.type + OS << "\tname_off=" << Member.name_off << " type=" << Member.type << " bit_offset=" << Member.offset << "\n"; } } @@ -220,9 +214,9 @@ void BTFTypeEntryStruct::emitData(MCObjectStreamer *MCOS) { } } -void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { +void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { BTFTypeEntry::print(OS, MCBTFContext); - for (size_t i = 0; i < 
BTF_INFO_VLEN(BTFType.info); i++) { + for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { auto Parameter = Parameters[i]; OS << "\tparam_type=" << Parameter << "\n"; } @@ -230,6 +224,6 @@ void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext& MCBTFContext) { void BTFTypeEntryFunc::emitData(MCObjectStreamer *MCOS) { BTFTypeEntry::emitData(MCOS); - for (auto &Parameter: Parameters) + for (auto &Parameter : Parameters) MCOS->EmitIntValue(Parameter, 4); } diff --git a/lib/MC/MCDwarf2BTF.cpp b/lib/MC/MCDwarf2BTF.cpp index 08a70e6f318..9809a2153ec 100644 --- a/lib/MC/MCDwarf2BTF.cpp +++ b/lib/MC/MCDwarf2BTF.cpp @@ -10,11 +10,11 @@ #include "MCDwarf2BTF.h" #include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCBTFContext.h" #include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include @@ -22,7 +22,7 @@ using namespace llvm; void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName, - std::vector &Files) { + std::vector &Files) { std::vector Content; std::ifstream Inputfile(FileName); @@ -34,9 +34,10 @@ void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName, Files.push_back(FileContent(FileName, Content)); } -void MCDwarf2BTF::addLines(MCObjectStreamer *MCOS, StringRef &SectionName, - std::vector &Files, - const MCLineSection::MCDwarfLineEntryCollection &LineEntries) { +void MCDwarf2BTF::addLines( + MCObjectStreamer *MCOS, StringRef &SectionName, + std::vector &Files, + const MCLineSection::MCDwarfLineEntryCollection &LineEntries) { MCContext &Context = MCOS->getContext(); auto &BTFCxt = Context.getBTFContext(); @@ -84,9 +85,11 @@ void MCDwarf2BTF::addDwarfLineInfo(MCObjectStreamer *MCOS) { FileName = Dirs[File.DirIndex - 1] + "/" + File.Name; MCDwarf2BTF::addFiles(MCOS, FileName, Files); } - for (const auto 
&LineSec: CUIDTablePair.second.getMCLineSections().getMCLineEntries()) { + for (const auto &LineSec : + CUIDTablePair.second.getMCLineSections().getMCLineEntries()) { MCSection *Section = LineSec.first; - const MCLineSection::MCDwarfLineEntryCollection &LineEntries = LineSec.second; + const MCLineSection::MCDwarfLineEntryCollection &LineEntries = + LineSec.second; StringRef SectionName; if (MCSectionELF *SectionELF = dyn_cast(Section)) diff --git a/lib/MC/MCDwarf2BTF.h b/lib/MC/MCDwarf2BTF.h index 22d1b7741a5..69983374a09 100644 --- a/lib/MC/MCDwarf2BTF.h +++ b/lib/MC/MCDwarf2BTF.h @@ -18,12 +18,13 @@ using FileContent = std::pair>; class MCDwarf2BTF { public: static void addFiles(MCObjectStreamer *MCOS, std::string &FileName, - std::vector &Files); - static void addLines(MCObjectStreamer *MCOS, StringRef &SectionName, - std::vector &Files, - const MCLineSection::MCDwarfLineEntryCollection &LineEntries); + std::vector &Files); + static void + addLines(MCObjectStreamer *MCOS, StringRef &SectionName, + std::vector &Files, + const MCLineSection::MCDwarfLineEntryCollection &LineEntries); static void addDwarfLineInfo(MCObjectStreamer *MCOS); }; -} +} // namespace llvm #endif -- GitLab From 9a80e3fe5a8cbb8e16744aded8c3015b3873153a Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Fri, 12 Oct 2018 17:55:21 +0000 Subject: [PATCH 0118/1116] Disambiguate: s/make_unique/llvm::make_unique/. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344385 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp index 44484c2ae05..5afd2c902ca 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp @@ -68,24 +68,24 @@ std::unique_ptr Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) { switch (Kind) { case BTF_KIND_INT: - return make_unique(Die); + return llvm::make_unique(Die); case BTF_KIND_PTR: case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: case BTF_KIND_FWD: - return make_unique(Die); + return llvm::make_unique(Die); case BTF_KIND_ARRAY: - return make_unique(Die); + return llvm::make_unique(Die); case BTF_KIND_STRUCT: case BTF_KIND_UNION: - return make_unique(Die); + return llvm::make_unique(Die); case BTF_KIND_ENUM: - return make_unique(Die); + return llvm::make_unique(Die); case BTF_KIND_FUNC: case BTF_KIND_FUNC_PROTO: - return make_unique(Die); + return llvm::make_unique(Die); default: break; } @@ -181,7 +181,7 @@ void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) { BTFType.name_off = Dwarf2BTF.addBTFString(Str); } - auto typeEntry = make_unique(Id, BTFType); + auto typeEntry = llvm::make_unique(Id, BTFType); Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } @@ -230,7 +230,7 @@ void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) { BTFType.name_off = Dwarf2BTF.addBTFString(Str); - auto typeEntry = make_unique(Id, BTFType, IntVal); + auto typeEntry = llvm::make_unique(Id, BTFType, IntVal); Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } @@ -270,7 +270,7 @@ void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) { EnumValues.push_back(BTFEnum); } - auto typeEntry = make_unique(Id, BTFType, EnumValues); + auto typeEntry = llvm::make_unique(Id, 
BTFType, EnumValues); Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } @@ -313,7 +313,7 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) { } ArrayInfo.nelems = Nelems; - auto TypeEntry = make_unique(Id, BTFType, ArrayInfo); + auto TypeEntry = llvm::make_unique(Id, BTFType, ArrayInfo); Dwarf2BTF.addBTFTypeEntry(std::move(TypeEntry)); } @@ -378,7 +378,7 @@ void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) { Members.push_back(BTFMember); } - auto typeEntry = make_unique(Id, BTFType, Members); + auto typeEntry = llvm::make_unique(Id, BTFType, Members); Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); } @@ -428,7 +428,7 @@ void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) { } } - auto typeEntry = make_unique(Id, BTFType, Parameters); + auto typeEntry = llvm::make_unique(Id, BTFType, Parameters); Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); if (BTF_INFO_KIND(BTFType.info) == BTF_KIND_FUNC) { @@ -455,7 +455,7 @@ void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) { Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian) : OuterCtx(Context), IsLE(IsLittleEndian) { - BTFContext = make_unique(); + BTFContext = llvm::make_unique(); } void Dwarf2BTF::addTypeEntry(const DIE &Die) { -- GitLab From ee500a522114e465a9738a5969705969cfe5243e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 12 Oct 2018 17:57:07 +0000 Subject: [PATCH 0119/1116] [BPF] Use cstdint {,u}int*_t instead of linux/types.h __u32 __u16 ... 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344387 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCBTFContext.h | 85 +++++++++++++--------------- lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 12 ++-- lib/CodeGen/AsmPrinter/Dwarf2BTF.h | 6 +- 3 files changed, 48 insertions(+), 55 deletions(-) diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h index 43103273acb..5b4bafd7aee 100644 --- a/include/llvm/MC/MCBTFContext.h +++ b/include/llvm/MC/MCBTFContext.h @@ -18,27 +18,20 @@ #include #include -typedef __signed__ char __s8; -typedef unsigned char __u8; -typedef __signed__ short __s16; -typedef unsigned short __u16; -typedef __signed__ int __s32; -typedef unsigned int __u32; - #define BTF_MAGIC 0xeB9F #define BTF_VERSION 1 struct btf_header { - __u16 magic; - __u8 version; - __u8 flags; - __u32 hdr_len; + uint16_t magic; + uint8_t version; + uint8_t flags; + uint32_t hdr_len; /* All offsets are in bytes relative to the end of this header */ - __u32 type_off; /* offset of type section */ - __u32 type_len; /* length of type section */ - __u32 str_off; /* offset of string section */ - __u32 str_len; /* length of string section */ + uint32_t type_off; // offset of type section + uint32_t type_len; // length of type section + uint32_t str_off; // offset of string section + uint32_t str_len; // length of string section }; /* Max # of type identifier */ @@ -49,14 +42,14 @@ struct btf_header { #define BTF_MAX_VLEN 0xffff struct btf_type { - __u32 name_off; + uint32_t name_off; /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. int, ptr, array...etc) * bits 28-31: unused */ - __u32 info; + uint32_t info; /* "size" is used by INT, ENUM, STRUCT and UNION. * "size" tells the size of the type it is describing. * @@ -65,8 +58,8 @@ struct btf_type { * "type" is a type_id referring to another type. 
*/ union { - __u32 size; - __u32 type; + uint32_t size; + uint32_t type; }; }; @@ -111,15 +104,15 @@ struct btf_type { * info in "struct btf_type"). */ struct btf_enum { - __u32 name_off; - __s32 val; + uint32_t name_off; + int32_t val; }; /* BTF_KIND_ARRAY is followed by one "struct btf_array" */ struct btf_array { - __u32 type; - __u32 index_type; - __u32 nelems; + uint32_t type; + uint32_t index_type; + uint32_t nelems; }; /* BTF_KIND_STRUCT and BTF_KIND_UNION are followed @@ -128,45 +121,45 @@ struct btf_array { * "struct btf_type"). */ struct btf_member { - __u32 name_off; - __u32 type; - __u32 offset; /* offset in bits */ + uint32_t name_off; + uint32_t type; + uint32_t offset; /* offset in bits */ }; /* .BTF.ext section contains func_info and line_info. */ struct btf_ext_header { - __u16 magic; - __u8 version; - __u8 flags; - __u32 hdr_len; - - __u32 func_info_off; - __u32 func_info_len; - __u32 line_info_off; - __u32 line_info_len; + uint16_t magic; + uint8_t version; + uint8_t flags; + uint32_t hdr_len; + + uint32_t func_info_off; + uint32_t func_info_len; + uint32_t line_info_off; + uint32_t line_info_len; }; struct bpf_func_info { - __u32 insn_offset; - __u32 type_id; + uint32_t insn_offset; + uint32_t type_id; }; struct btf_sec_func_info { - __u32 sec_name_off; - __u32 num_func_info; + uint32_t sec_name_off; + uint32_t num_func_info; }; struct bpf_line_info { - __u32 insn_offset; - __u32 file_name_off; - __u32 line_off; - __u32 line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */ + uint32_t insn_offset; + uint32_t file_name_off; + uint32_t line_off; + uint32_t line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */ }; struct btf_sec_line_info { - __u32 sec_name_off; - __u32 num_line_info; + uint32_t sec_name_off; + uint32_t num_line_info; }; namespace llvm { diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp index 5afd2c902ca..20cc61df9b6 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp 
+++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp @@ -200,7 +200,7 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { auto Encoding = Die2BTFEntry::getBaseTypeEncoding(Die); assert((Encoding != BTF_INVALID_ENCODING) && "Invalid Die passed to BTFTypeEntryInt()"); - __u32 IntVal = (Encoding & 0xf) << 24; + uint32_t IntVal = (Encoding & 0xf) << 24; // handle BTF_INT_OFFSET in IntVal auto V = Die.findAttribute(dwarf::DW_AT_bit_offset); @@ -209,7 +209,7 @@ Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { // get btf_type.size V = Die.findAttribute(dwarf::DW_AT_byte_size); - __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; + uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff; // handle BTF_INT_BITS in IntVal V = Die.findAttribute(dwarf::DW_AT_bit_size); @@ -237,7 +237,7 @@ void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) { Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) { // get btf_type.size auto V = Die.findAttribute(dwarf::DW_AT_byte_size); - __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; + uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff; int Vlen = 0; for (auto &ChildDie : Die.children()) @@ -265,7 +265,7 @@ void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) { BTFEnum.name_off = Dwarf2BTF.addBTFString(Str); auto ChildValueV = ChildDie.findAttribute(dwarf::DW_AT_const_value); - BTFEnum.val = (__s32)(ChildValueV.getDIEInteger().getValue()); + BTFEnum.val = (int32_t)(ChildValueV.getDIEInteger().getValue()); EnumValues.push_back(BTFEnum); } @@ -308,7 +308,7 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) { Nelems = 0; break; } - Nelems *= (__u32)(CountV.getDIEInteger().getValue()); + Nelems *= (uint32_t)(CountV.getDIEInteger().getValue()); } } ArrayInfo.nelems = Nelems; @@ -320,7 +320,7 @@ void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) { Die2BTFEntryStruct::Die2BTFEntryStruct(const DIE &Die) : Die2BTFEntry(Die) { // 
get btf_type.size auto V = Die.findAttribute(dwarf::DW_AT_byte_size); - __u32 Size = V.getDIEInteger().getValue() & 0xffffffff; + uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff; auto Kind = Die2BTFEntry::getDieKind(Die); int Vlen = 0; diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h index a472d68ed7e..ae13847214c 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h @@ -54,7 +54,7 @@ public: // BTF_KIND_INT class Die2BTFEntryInt : public Die2BTFEntry { - __u32 IntVal; // encoding, offset, bits + uint32_t IntVal; // encoding, offset, bits public: Die2BTFEntryInt(const DIE &Die); @@ -90,7 +90,7 @@ public: // BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO class Die2BTFEntryFunc : public Die2BTFEntry { - std::vector<__u32> Parameters; + std::vector Parameters; public: Die2BTFEntryFunc(const DIE &Die); @@ -109,7 +109,7 @@ public: bool isLittleEndian() { return IsLE; } void addDwarfCU(DwarfUnit *TheU); void finish(); - __u32 getTypeIndex(DIE &Die) { + uint32_t getTypeIndex(DIE &Die) { DIE *DiePtr = const_cast(&Die); assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) && "Die not added to in the BTFContext"); -- GitLab From f5782f7024e1f46381bd455f8eefb147669766e1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 18:10:04 +0000 Subject: [PATCH 0120/1116] Fix MCBTF string array initialization so its MSVC friendly. NFCI. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344390 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCBTFContext.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h index 5b4bafd7aee..5ef35f12609 100644 --- a/include/llvm/MC/MCBTFContext.h +++ b/include/llvm/MC/MCBTFContext.h @@ -165,13 +165,20 @@ struct btf_sec_line_info { namespace llvm { const char *const btf_kind_str[NR_BTF_KINDS] = { - [BTF_KIND_UNKN] = "UNKNOWN", [BTF_KIND_INT] = "INT", - [BTF_KIND_PTR] = "PTR", [BTF_KIND_ARRAY] = "ARRAY", - [BTF_KIND_STRUCT] = "STRUCT", [BTF_KIND_UNION] = "UNION", - [BTF_KIND_ENUM] = "ENUM", [BTF_KIND_FWD] = "FWD", - [BTF_KIND_TYPEDEF] = "TYPEDEF", [BTF_KIND_VOLATILE] = "VOLATILE", - [BTF_KIND_CONST] = "CONST", [BTF_KIND_RESTRICT] = "RESTRICT", - [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + "UNKNOWN", /* BTF_KIND_UNKN */ + "INT", /* BTF_KIND_INT */ + "PTR", /* BTF_KIND_PTR */ + "ARRAY", /* BTF_KIND_ARRAY */ + "STRUCT", /* BTF_KIND_STRUCT */ + "UNION", /* BTF_KIND_UNION */ + "ENUM", /* BTF_KIND_ENUM */ + "FWD", /* BTF_KIND_FWD */ + "TYPEDEF", /* BTF_KIND_TYPEDEF */ + "VOLATILE", /* BTF_KIND_VOLATILE */ + "CONST", /* BTF_KIND_CONST */ + "RESTRICT", /* BTF_KIND_RESTRICT */ + "FUNC", /* BTF_KIND_FUNC */ + "FUNC_PROTO", /* BTF_KIND_FUNC_PROTO */ }; class MCBTFContext; -- GitLab From 40c1d29a9d14e55fbe5db9fbb3f433ef01af4e5f Mon Sep 17 00:00:00 2001 From: Jonathan Metzman Date: Fri, 12 Oct 2018 18:11:47 +0000 Subject: [PATCH 0121/1116] [SanitizerCoverage] Prevent /OPT:REF from stripping constructors Summary: Linking with the /OPT:REF linker flag when building COFF files causes the linker to strip SanitizerCoverage's constructors. Prevent this by giving the constructors WeakODR linkage and by passing the linker a directive to include sancov.module_ctor. 
Include a test in compiler-rt to verify libFuzzer can be linked using /OPT:REF Reviewers: morehouse, rnk Reviewed By: morehouse, rnk Subscribers: rnk, morehouse, hiraditya Differential Revision: https://reviews.llvm.org/D52119 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344391 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Instrumentation/SanitizerCoverage.cpp | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index bf461c61ede..0bed4139518 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" @@ -298,6 +299,26 @@ Function *SanitizerCoverageModule::CreateInitCallsForSections( } else { appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority); } + + if (TargetTriple.getObjectFormat() == Triple::COFF) { + // In COFF files, if the constructors are set as COMDAT (they are because + // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced + // functions and data) is used, the constructors get stripped. To prevent + // this, give the constructors weak ODR linkage and tell the linker to + // always include the sancov constructor. This way the linker can + // deduplicate the constructors but always leave one copy. + CtorFunc->setLinkage(GlobalValue::WeakODRLinkage); + SmallString<20> PartialIncDirective("/include:"); + // Get constructor's mangled name in order to support i386. 
+ SmallString<40> MangledName; + Mangler().getNameWithPrefix(MangledName, CtorFunc, true); + Twine IncDirective = PartialIncDirective + MangledName; + Metadata *Args[1] = {MDString::get(*C, IncDirective.str())}; + MDNode *MetadataNode = MDNode::get(*C, Args); + NamedMDNode *NamedMetadata = + M.getOrInsertNamedMetadata("llvm.linker.options"); + NamedMetadata->addOperand(MetadataNode); + } return CtorFunc; } -- GitLab From 787355713025c285219b7e239d9ce184249646d1 Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Fri, 12 Oct 2018 18:18:53 +0000 Subject: [PATCH 0122/1116] [llvm-mca] Correctly set aliases for register writes introduced by optimized register moves. This fixes a problem introduced by r344334. A write from a non-zero move eliminated at register renaming stage was not correctly handled by the PRF. This would have led to an assertion failure if the processor model declares a PRF that enables non-zero move elimination. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344392 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../include/HardwareUnits/RegisterFile.h | 8 ++- .../lib/HardwareUnits/RegisterFile.cpp | 65 +++++++++++++++---- tools/llvm-mca/lib/Stages/DispatchStage.cpp | 15 +++-- 3 files changed, 72 insertions(+), 16 deletions(-) diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h index 6a45c707de0..4b8b623bfe6 100644 --- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h +++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h @@ -109,12 +109,18 @@ class RegisterFile : public HardwareUnit { // // Field `AllowMoveElimination` is set for registers that are used as // destination by optimizable register moves. + // + // Field `AliasRegID` is set by writes from register moves that have been + // eliminated at register renaming stage. A move eliminated at register + // renaming stage is effectively bypassed, and its write aliases the source + // register definition. 
struct RegisterRenamingInfo { IndexPlusCostPairTy IndexPlusCost; llvm::MCPhysReg RenameAs; + llvm::MCPhysReg AliasRegID; bool AllowMoveElimination; RegisterRenamingInfo() - : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U), + : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U), AliasRegID(0U), AllowMoveElimination(false) {} }; diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp index 481e2e18fa9..4a2a00523ae 100644 --- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp +++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp @@ -171,7 +171,8 @@ void RegisterFile::addRegisterWrite(WriteRef Write, // implicitly clears the upper portion of the underlying register. // If a write clears its super-registers, then it is renamed as `RenameAs`. bool IsWriteZero = WS.isWriteZero(); - bool ShouldAllocatePhysRegs = !IsWriteZero; + bool IsEliminated = WS.isEliminated(); + bool ShouldAllocatePhysRegs = !IsWriteZero && !IsEliminated; const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; if (RRI.RenameAs && RRI.RenameAs != RegID) { @@ -187,6 +188,7 @@ void RegisterFile::addRegisterWrite(WriteRef Write, if (OtherWrite.getWriteState() && (OtherWrite.getSourceIndex() != Write.getSourceIndex())) { // This partial write has a false dependency on RenameAs. + assert(!IsEliminated && "Unexpected partial update!"); WS.setDependentWrite(OtherWrite.getWriteState()); } } @@ -205,22 +207,33 @@ void RegisterFile::addRegisterWrite(WriteRef Write, ZeroRegisters.clearBit(*I); } - // Update the mapping for register RegID including its sub-registers. - RegisterMappings[RegID].first = Write; - for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) - RegisterMappings[*I].first = Write; + // If this move has been eliminated, then the call to tryEliminateMove + // should have already updated all the register mappings. + if (!IsEliminated) { + // Update the mapping for register RegID including its sub-registers. 
+ RegisterMappings[RegID].first = Write; + RegisterMappings[RegID].second.AliasRegID = 0U; + for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) { + RegisterMappings[*I].first = Write; + RegisterMappings[*I].second.AliasRegID = 0U; + } - // No physical registers are allocated for instructions that are optimized in - // hardware. For example, zero-latency data-dependency breaking instructions - // don't consume physical registers. - if (ShouldAllocatePhysRegs) - allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs); + // No physical registers are allocated for instructions that are optimized in + // hardware. For example, zero-latency data-dependency breaking instructions + // don't consume physical registers. + if (ShouldAllocatePhysRegs) + allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs); + } if (!WS.clearsSuperRegisters()) return; for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) { - RegisterMappings[*I].first = Write; + if (!IsEliminated) { + RegisterMappings[*I].first = Write; + RegisterMappings[*I].second.AliasRegID = 0U; + } + if (IsWriteZero) ZeroRegisters.setBit(*I); else @@ -230,6 +243,11 @@ void RegisterFile::addRegisterWrite(WriteRef Write, void RegisterFile::removeRegisterWrite( const WriteState &WS, MutableArrayRef FreedPhysRegs) { + // Early exit if this write was eliminated. A write eliminated at register + // renaming stage generates an alias, and it is not added to the PRF. + if (WS.isEliminated()) + return; + unsigned RegID = WS.getRegisterID(); assert(RegID != 0 && "Invalidating an already invalid register?"); @@ -313,10 +331,29 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) { if (RMT.AllowZeroMoveEliminationOnly && !IsZeroMove) return false; + MCPhysReg FromReg = RS.getRegisterID(); + MCPhysReg ToReg = WS.getRegisterID(); + + // Construct an alias. 
+ MCPhysReg AliasReg = FromReg; + if (RRIFrom.RenameAs) + AliasReg = RRIFrom.RenameAs; + + const RegisterRenamingInfo &RMAlias = RegisterMappings[AliasReg].second; + if (RMAlias.AliasRegID) + AliasReg = RMAlias.AliasRegID; + + if (AliasReg != ToReg) { + RegisterMappings[ToReg].second.AliasRegID = AliasReg; + for (MCSubRegIterator I(ToReg, &MRI); I.isValid(); ++I) + RegisterMappings[*I].second.AliasRegID = AliasReg; + } + RMT.NumMoveEliminated++; if (IsZeroMove) WS.setWriteZero(); WS.setEliminated(); + return true; } @@ -325,6 +362,12 @@ void RegisterFile::collectWrites(SmallVectorImpl &Writes, assert(RegID && RegID < RegisterMappings.size()); LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register " << MRI.getName(RegID) << '\n'); + + // Check if this is an alias. + const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second; + if (RRI.AliasRegID) + RegID = RRI.AliasRegID; + const WriteRef &WR = RegisterMappings[RegID].first; if (WR.isValid()) Writes.push_back(WR); diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp index c33b86027da..a6be2474554 100644 --- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp +++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp @@ -101,10 +101,11 @@ Error DispatchStage::dispatch(InstRef IR) { } // Check if this is an optimizable reg-reg move. + bool IsEliminated = false; if (IS.isOptimizableMove()) { assert(IS.getDefs().size() == 1 && "Expected a single input!"); assert(IS.getUses().size() == 1 && "Expected a single output!"); - PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]); + IsEliminated = PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]); } // A dependency-breaking instruction doesn't have to wait on the register @@ -113,9 +114,15 @@ Error DispatchStage::dispatch(InstRef IR) { // instruction. A dependency-breaking instruction is a zero-latency // instruction that doesn't consume hardware resources. 
// An example of dependency-breaking instruction on X86 is a zero-idiom XOR. - for (std::unique_ptr &RS : IS.getUses()) - if (!RS->isIndependentFromDef()) - updateRAWDependencies(*RS, STI); + // + // We also don't update data dependencies for instructions that have been + // eliminated at register renaming stage. + if (!IsEliminated) { + for (std::unique_ptr &RS : IS.getUses()) { + if (!RS->isIndependentFromDef()) + updateRAWDependencies(*RS, STI); + } + } // By default, a dependency-breaking zero-idiom is expected to be optimized // at register renaming stage. That means, no physical register is allocated -- GitLab From a5213c4729d78e374823bb5b15a279a659aa389d Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Fri, 12 Oct 2018 18:19:06 +0000 Subject: [PATCH 0123/1116] [codeview] Emit S_BUILDINFO and LF_BUILDINFO with cwd and source file Summary: We can fill in the command line and compiler path later if we want. Reviewers: zturner Subscribers: hiraditya, llvm-commits Differential Revision: https://reviews.llvm.org/D53179 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344393 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/DebugInfo/CodeView/TypeRecord.h | 12 ++++- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 48 ++++++++++++++++++++ lib/CodeGen/AsmPrinter/CodeViewDebug.h | 2 + test/DebugInfo/COFF/build-info.ll | 39 ++++++++++++++++ 4 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 test/DebugInfo/COFF/build-info.ll diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index ee6f53854e7..af4e8f40575 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -655,7 +655,17 @@ public: ArrayRef getArgs() const { return ArgIndices; } - SmallVector ArgIndices; + /// Indices of known build info arguments. 
+ enum BuildInfoArg { + CurrentDirectory, ///< Absolute CWD path + BuildTool, ///< Absolute compiler path + SourceFile, ///< Path to main source file, relative or absolute + TypeServerPDB, ///< Absolute path of type server PDB (/Fd) + CommandLine, ///< Full canonical command line (maybe -cc1) + MaxArgs + }; + + SmallVector ArgIndices; }; // LF_VFTABLE diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 8232f076a93..3b503b683a0 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -561,6 +561,11 @@ void CodeViewDebug::endModule() { OS.AddComment("String table"); OS.EmitCVStringTableDirective(); + // Emit S_BUILDINFO, which points to LF_BUILDINFO. Put this in its own symbol + // subsection in the generic .debug$S section at the end. There is no + // particular reason for this ordering other than to match MSVC. + emitBuildInfo(); + // Emit type information and hashes last, so that any types we translate while // emitting function info are included. emitTypeInformation(); @@ -772,6 +777,49 @@ void CodeViewDebug::emitCompilerInformation() { OS.EmitLabel(CompilerEnd); } +static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable, + StringRef S) { + StringIdRecord SIR(TypeIndex(0x0), S); + return TypeTable.writeLeafType(SIR); +} + +void CodeViewDebug::emitBuildInfo() { + // First, make LF_BUILDINFO. It's a sequence of strings with various bits of + // build info. The known prefix is: + // - Absolute path of current directory + // - Compiler path + // - Main source file path, relative to CWD or absolute + // - Type server PDB file + // - Canonical compiler command line + // If frontend and backend compilation are separated (think llc or LTO), it's + // not clear if the compiler path should refer to the executable for the + // frontend or the backend. Leave it blank for now. 
+ TypeIndex BuildInfoArgs[BuildInfoRecord::MaxArgs] = {}; + NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); + const MDNode *Node = *CUs->operands().begin(); // FIXME: Multiple CUs. + const auto *CU = cast(Node); + const DIFile *MainSourceFile = CU->getFile(); + BuildInfoArgs[BuildInfoRecord::CurrentDirectory] = + getStringIdTypeIdx(TypeTable, MainSourceFile->getDirectory()); + BuildInfoArgs[BuildInfoRecord::SourceFile] = + getStringIdTypeIdx(TypeTable, MainSourceFile->getFilename()); + // FIXME: Path to compiler and command line. PDB is intentionally blank unless + // we implement /Zi type servers. + BuildInfoRecord BIR(BuildInfoArgs); + TypeIndex BuildInfoIndex = TypeTable.writeLeafType(BIR); + + // Make a new .debug$S subsection for the S_BUILDINFO record, which points + // from the module symbols into the type stream. + MCSymbol *BuildInfoEnd = beginCVSubsection(DebugSubsectionKind::Symbols); + OS.AddComment("Record length"); + OS.EmitIntValue(6, 2); + OS.AddComment("Record kind: S_BUILDINFO"); + OS.EmitIntValue(unsigned(SymbolKind::S_BUILDINFO), 2); + OS.AddComment("LF_BUILDINFO index"); + OS.EmitIntValue(BuildInfoIndex.getIndex(), 4); + endCVSubsection(BuildInfoEnd); +} + void CodeViewDebug::emitInlineeLinesSubsection() { if (InlinedSubprograms.empty()) return; diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h index b97092a642e..b6fbdc1373f 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -272,6 +272,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { void emitCompilerInformation(); + void emitBuildInfo(); + void emitInlineeLinesSubsection(); void emitDebugInfoForThunk(const Function *GV, diff --git a/test/DebugInfo/COFF/build-info.ll b/test/DebugInfo/COFF/build-info.ll new file mode 100644 index 00000000000..94f006c3b09 --- /dev/null +++ b/test/DebugInfo/COFF/build-info.ll @@ -0,0 +1,39 @@ +; RUN: llc -filetype=obj 
-mtriple i686-pc-windows-msvc %s -o %t.o +; RUN: llvm-pdbutil dump %t.o -symbols -types | FileCheck %s + +; CHECK: [[INFO_IDX:0x[^ ]*]] | LF_BUILDINFO +; CHECK-NEXT: 0x{{.*}}: `D:\src\scopes\clang` +; CHECK-NEXT: : `` +; CHECK-NEXT: 0x{{.*}}: `D:\src\scopes\foo.cpp` +; CHECK-NEXT: : `` +; CHECK-NEXT: : `` + +; CHECK: {{.*}} | S_BUILDINFO [size = 8] BuildId = `[[INFO_IDX]]` + +; ModuleID = 'D:\src\scopes\foo.cpp' +source_filename = "D:\5Csrc\5Cscopes\5Cfoo.cpp" +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc19.0.23918" + +define i32 @"?foo@@YAHXZ"() !dbg !10 { +entry: + ret i32 42, !dbg !14 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +; One .debug$S section should contain an S_COMPILE3 record that identifies the +; source language and the version of the compiler based on the DICompileUnit. +!1 = !DIFile(filename: "D:\5Csrc\5Cscopes\5Cfoo.cpp", directory: "D:\5Csrc\5Cscopes\5Cclang") +!2 = !{} +!7 = !{i32 2, !"CodeView", i32 1} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{!"clang version 4.0.0 "} +!10 = distinct !DISubprogram(name: "foo", linkageName: "\01?foo@@YAHXZ", scope: !1, file: !1, line: 1, type: !11, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2) +!11 = !DISubroutineType(types: !12) +!12 = !{!13} +!13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!14 = !DILocation(line: 2, scope: !10) -- GitLab From 6316db4486f9763e51db5ea07c22275ad84936d4 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Fri, 12 Oct 2018 18:29:30 +0000 Subject: [PATCH 0124/1116] Replace assert() with llvm_unreachable because it's obviously a typo. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344395 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp index 20cc61df9b6..b3e6fce97b6 100644 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp +++ b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp @@ -155,7 +155,7 @@ Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) { case BTF_KIND_VOLATILE: break; default: - assert("Invalid Die passed into BTFTypeEntry()"); + llvm_unreachable("Invalid Die passed into BTFTypeEntry()"); break; } -- GitLab From c27563a142810a55593bb298c5b642085a971485 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 19:03:54 +0000 Subject: [PATCH 0125/1116] Regenerate test. NFCI. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344399 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/x86-interleaved-access.ll | 32 +++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index bf087e12833..e4624eaf363 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -1591,7 +1591,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX1-NEXT: vorps %ymm12, %ymm14, %ymm12 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14 ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vandnps %ymm14, %ymm13, %ymm14 ; AVX1-NEXT: vandps %ymm13, %ymm7, %ymm7 ; AVX1-NEXT: vorps %ymm14, %ymm7, %ymm13 @@ -1616,7 +1616,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm0, %xmm4, 
%xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-NEXT: vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpaddb %xmm11, %xmm12, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 @@ -1732,22 +1732,22 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15] -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = 
xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 @@ -1756,7 +1756,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm1 -; AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] ; AVX1-NEXT: vmovdqa %xmm8, %xmm2 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] @@ -1765,16 +1765,16 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm14 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9 ; AVX1-NEXT: vpunpckhwd 
{{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] @@ -1788,7 +1788,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm6 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm10, %ymm2 -; AVX1-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm3 -- GitLab From c6422e18ae6edcce196d7667d6970cd175a8d560 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 12 Oct 2018 19:30:43 +0000 Subject: [PATCH 0126/1116] Fix Wdocumentation warning. NFCI. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344402 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/DebugInfo/CodeView/TypeRecord.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index af4e8f40575..9a06a6a3344 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -657,11 +657,11 @@ public: /// Indices of known build info arguments. 
enum BuildInfoArg { - CurrentDirectory, //< Absolute CWD path - BuildTool, //< Absolute compiler path - SourceFile, //< Path to main source file, relative or absolute - TypeServerPDB, //< Absoulte path of type server PDB (/Fd) - CommandLine, //< Full canonical command line (maybe -cc1) + CurrentDirectory, ///< Absolute CWD path + BuildTool, ///< Absolute compiler path + SourceFile, ///< Path to main source file, relative or absolute + TypeServerPDB, ///< Absolute path of type server PDB (/Fd) + CommandLine, ///< Full canonical command line (maybe -cc1) MaxArgs }; -- GitLab From 6147a037f6cd132b4659c2ae0553a728f376c33c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 19:37:47 +0000 Subject: [PATCH 0127/1116] [LegalizeVectorTypes] When unrolling in WidenVecRes_Convert, make sure we use the original vector element count. Not min of the widened result type and the possibly widened input type. If the input type is widened as well, but we still were forced to unroll, we shouldn't be considering the widened input element count. We should only create as many scalar operations as the original type called for. This will be important for an upcoming patch. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344403 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeVectorTypes.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 1b07358561a..6bee966a327 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2809,11 +2809,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { if (WidenNumElts % InVTNumElts == 0) { // Widen the input and call convert on the widened input vector. 
unsigned NumConcat = WidenNumElts/InVTNumElts; - SmallVector Ops(NumConcat); + SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = InOp; - SDValue UndefVal = DAG.getUNDEF(InVT); - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = UndefVal; SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops); if (N->getNumOperands() == 1) return DAG.getNode(Opcode, DL, WidenVT, InVec); @@ -2832,11 +2829,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { } // Otherwise unroll into some nasty scalar code and rebuild the vector. - SmallVector Ops(WidenNumElts); EVT EltVT = WidenVT.getVectorElementType(); - unsigned MinElts = std::min(InVTNumElts, WidenNumElts); - unsigned i; - for (i=0; i < MinElts; ++i) { + SmallVector Ops(WidenNumElts, DAG.getUNDEF(EltVT)); + // Use the original element count so we don't do more scalar opts than + // necessary. + unsigned MinElts = N->getValueType(0).getVectorNumElements(); + for (unsigned i=0; i < MinElts; ++i) { SDValue Val = DAG.getNode( ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp, DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))); @@ -2846,10 +2844,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) { Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags); } - SDValue UndefVal = DAG.getUNDEF(EltVT); - for (; i < WidenNumElts; ++i) - Ops[i] = UndefVal; - return DAG.getBuildVector(WidenVT, DL, Ops); } -- GitLab From c646992975f1223770858f996a2fd91660dc1ca0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 19:37:49 +0000 Subject: [PATCH 0128/1116] [LegalizeVectorTypes] When widening the operands to a concat_vectors, see if we can use the widened operand 0 if the width matches and the other operands are undef. This saves a conversion to extracts and build_vector. We already do this when both the result and the input need to be widened to the same type. 
This changed the sse-intrinsics-fast-isel test because we don't lower (insert_vector_elt (scalar_to_vector X), Y, 1) well. We turn it into (vector_shuffle (scalar_to_vector X), (scalar_to_vector Y), <0, 4, 2, 3>) losing track of the fact that the upper elts could be undef. We should probably find a way to prevent the scalarization of the <2 x f32> load on these tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344404 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeVectorTypes.cpp | 21 +++++++++++++----- test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 22 +++++++++---------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 6bee966a327..310f5ef5dc7 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3794,20 +3794,31 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) { } SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) { - // If the input vector is not legal, it is likely that we will not find a - // legal vector of the same size. Replace the concatenate vector with a - // nasty build vector. EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); + EVT InVT = N->getOperand(0).getValueType(); SDLoc dl(N); + + // If the widen width for this operand is the same as the width of the concat + // and all but the first operand is undef, just use the widened operand. + unsigned NumOperands = N->getNumOperands(); + if (VT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) { + unsigned i; + for (i = 1; i < NumOperands; ++i) + if (!N->getOperand(i).isUndef()) + break; + + if (i == NumOperands) + return GetWidenedVector(N->getOperand(0)); + } + + // Otherwise, fall back to a nasty build vector. 
unsigned NumElts = VT.getVectorNumElements(); SmallVector Ops(NumElts); - EVT InVT = N->getOperand(0).getValueType(); unsigned NumInElts = InVT.getVectorNumElements(); unsigned Idx = 0; - unsigned NumOperands = N->getNumOperands(); for (unsigned i=0; i < NumOperands; ++i) { SDValue InOp = N->getOperand(i); assert(getTypeAction(InOp.getValueType()) == diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 47649a54e80..1ccd586c453 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1320,10 +1320,10 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) { ; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04] ; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1] -; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] +; X86-SSE-NEXT: shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00] +; X86-SSE-NEXT: # xmm2 = xmm2[0,0],xmm1[0,0] +; X86-SSE-NEXT: shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24] +; X86-SSE-NEXT: # xmm0 = xmm0[0,1],xmm2[2,0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_loadh_pi: @@ -1378,14 +1378,14 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) { ; X86-SSE-LABEL: test_mm_loadl_pi: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08] -; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04] +; X86-SSE-NEXT: movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10] ; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-SSE-NEXT: unpcklps %xmm2, %xmm1 # 
encoding: [0x0f,0x14,0xca] -; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-SSE-NEXT: shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4] -; X86-SSE-NEXT: # xmm1 = xmm1[0,1],xmm0[2,3] +; X86-SSE-NEXT: movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04] +; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00] +; X86-SSE-NEXT: # xmm1 = xmm1[0,0],xmm2[0,0] +; X86-SSE-NEXT: shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2] +; X86-SSE-NEXT: # xmm1 = xmm1[2,0],xmm0[2,3] ; X86-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; -- GitLab From c206978e8af6632a2f32f2b02e544ca122be480f Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Fri, 12 Oct 2018 19:41:05 +0000 Subject: [PATCH 0129/1116] Revert BTF commit series. The initial patch was not reviewed, and does not have any tests; it should not have been merged. This reverts 344395, 344390, 344387, 344385, 344381, 344376, and 344366. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344405 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCBTFContext.h | 361 ------------------- include/llvm/MC/MCContext.h | 7 - include/llvm/MC/MCObjectFileInfo.h | 8 - include/llvm/MC/MCObjectStreamer.h | 1 - lib/CodeGen/AsmPrinter/CMakeLists.txt | 1 - lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp | 501 -------------------------- lib/CodeGen/AsmPrinter/Dwarf2BTF.h | 133 ------- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 10 - lib/CodeGen/AsmPrinter/DwarfDebug.h | 3 - lib/CodeGen/AsmPrinter/DwarfFile.cpp | 10 - lib/CodeGen/AsmPrinter/DwarfFile.h | 7 +- lib/MC/CMakeLists.txt | 2 - lib/MC/MCBTFContext.cpp | 229 ------------ lib/MC/MCContext.cpp | 11 +- lib/MC/MCDwarf2BTF.cpp | 102 ------ lib/MC/MCDwarf2BTF.h | 30 -- lib/MC/MCObjectFileInfo.cpp | 3 - lib/MC/MCObjectStreamer.cpp | 34 -- 18 files changed, 4 insertions(+), 1449 deletions(-) delete mode 100644 include/llvm/MC/MCBTFContext.h delete mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp delete mode 100644 lib/CodeGen/AsmPrinter/Dwarf2BTF.h delete mode 100644 lib/MC/MCBTFContext.cpp delete mode 100644 lib/MC/MCDwarf2BTF.cpp delete mode 100644 lib/MC/MCDwarf2BTF.h diff --git a/include/llvm/MC/MCBTFContext.h b/include/llvm/MC/MCBTFContext.h deleted file mode 100644 index 5ef35f12609..00000000000 --- a/include/llvm/MC/MCBTFContext.h +++ /dev/null @@ -1,361 +0,0 @@ -//===- MCBTFContext.h ---------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -// This header file contains two parts. The first part is the BTF ELF -// specification in C format, and the second part is the various -// C++ classes to manipulate the data structure in order to generate -// the BTF related ELF sections. 
-//===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCBTFCONTEXT_H -#define LLVM_MC_MCBTFCONTEXT_H - -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/raw_ostream.h" -#include -#include - -#define BTF_MAGIC 0xeB9F -#define BTF_VERSION 1 - -struct btf_header { - uint16_t magic; - uint8_t version; - uint8_t flags; - uint32_t hdr_len; - - /* All offsets are in bytes relative to the end of this header */ - uint32_t type_off; // offset of type section - uint32_t type_len; // length of type section - uint32_t str_off; // offset of string section - uint32_t str_len; // length of string section -}; - -/* Max # of type identifier */ -#define BTF_MAX_TYPE 0x0000ffff -/* Max offset into the string section */ -#define BTF_MAX_NAME_OFFSET 0x0000ffff -/* Max # of struct/union/enum members or func args */ -#define BTF_MAX_VLEN 0xffff - -struct btf_type { - uint32_t name_off; - /* "info" bits arrangement - * bits 0-15: vlen (e.g. # of struct's members) - * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused - */ - uint32_t info; - /* "size" is used by INT, ENUM, STRUCT and UNION. - * "size" tells the size of the type it is describing. - * - * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. - * "type" is a type_id referring to another type. 
- */ - union { - uint32_t size; - uint32_t type; - }; -}; - -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) -#define BTF_INFO_VLEN(info) ((info)&0xffff) - -#define BTF_KIND_UNKN 0 /* Unknown */ -#define BTF_KIND_INT 1 /* Integer */ -#define BTF_KIND_PTR 2 /* Pointer */ -#define BTF_KIND_ARRAY 3 /* Array */ -#define BTF_KIND_STRUCT 4 /* Struct */ -#define BTF_KIND_UNION 5 /* Union */ -#define BTF_KIND_ENUM 6 /* Enumeration */ -#define BTF_KIND_FWD 7 /* Forward */ -#define BTF_KIND_TYPEDEF 8 /* Typedef */ -#define BTF_KIND_VOLATILE 9 /* Volatile */ -#define BTF_KIND_CONST 10 /* Const */ -#define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_FUNC 12 /* Function */ -#define BTF_KIND_FUNC_PROTO 13 /* Function Prototype */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 - -/* For some specific BTF_KIND, "struct btf_type" is immediately - * followed by extra data. - */ - -/* BTF_KIND_INT is followed by a u32 and the following - * is the 32 bits arrangement: - */ -#define BTF_INT_ENCODING(VAL) (((VAL)&0x0f000000) >> 24) -#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) -#define BTF_INT_BITS(VAL) ((VAL)&0x000000ff) - -/* Attributes stored in the BTF_INT_ENCODING */ -#define BTF_INT_SIGNED (1 << 0) -#define BTF_INT_CHAR (1 << 1) -#define BTF_INT_BOOL (1 << 2) - -/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". - * The exact number of btf_enum is stored in the vlen (of the - * info in "struct btf_type"). - */ -struct btf_enum { - uint32_t name_off; - int32_t val; -}; - -/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ -struct btf_array { - uint32_t type; - uint32_t index_type; - uint32_t nelems; -}; - -/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed - * by multiple "struct btf_member". The exact number - * of btf_member is stored in the vlen (of the info in - * "struct btf_type"). 
- */ -struct btf_member { - uint32_t name_off; - uint32_t type; - uint32_t offset; /* offset in bits */ -}; - -/* .BTF.ext section contains func_info and line_info. - */ -struct btf_ext_header { - uint16_t magic; - uint8_t version; - uint8_t flags; - uint32_t hdr_len; - - uint32_t func_info_off; - uint32_t func_info_len; - uint32_t line_info_off; - uint32_t line_info_len; -}; - -struct bpf_func_info { - uint32_t insn_offset; - uint32_t type_id; -}; - -struct btf_sec_func_info { - uint32_t sec_name_off; - uint32_t num_func_info; -}; - -struct bpf_line_info { - uint32_t insn_offset; - uint32_t file_name_off; - uint32_t line_off; - uint32_t line_col; /* line num: line_col >> 10, col num: line_col & 0x3ff */ -}; - -struct btf_sec_line_info { - uint32_t sec_name_off; - uint32_t num_line_info; -}; - -namespace llvm { - -const char *const btf_kind_str[NR_BTF_KINDS] = { - "UNKNOWN", /* BTF_KIND_UNKN */ - "INT", /* BTF_KIND_INT */ - "PTR", /* BTF_KIND_PTR */ - "ARRAY", /* BTF_KIND_ARRAY */ - "STRUCT", /* BTF_KIND_STRUCT */ - "UNION", /* BTF_KIND_UNION */ - "ENUM", /* BTF_KIND_ENUM */ - "FWD", /* BTF_KIND_FWD */ - "TYPEDEF", /* BTF_KIND_TYPEDEF */ - "VOLATILE", /* BTF_KIND_VOLATILE */ - "CONST", /* BTF_KIND_CONST */ - "RESTRICT", /* BTF_KIND_CONST */ - "FUNC", /* BTF_KIND_FUNC */ - "FUNC_PROTO", /* BTF_KIND_FUNC_PROTO */ -}; - -class MCBTFContext; -class MCObjectStreamer; - -// This is base class of all BTF KIND. 
It is also used directly -// by the reference kinds: -// BTF_KIND_CONST, BTF_KIND_PTR, BTF_KIND_VOLATILE, -// BTF_KIND_TYPEDEF, BTF_KIND_RESTRICT, and BTF_KIND_FWD -class BTFTypeEntry { -protected: - size_t Id; /* type index in the BTF list, started from 1 */ - struct btf_type BTFType; - -public: - BTFTypeEntry(size_t id, struct btf_type &type) : Id(id), BTFType(type) {} - virtual ~BTFTypeEntry(); - unsigned char getKind() { return BTF_INFO_KIND(BTFType.info); } - void setId(size_t Id) { this->Id = Id; } - size_t getId() { return Id; } - void setNameOff(unsigned NameOff) { BTFType.name_off = NameOff; } - - unsigned getTypeIndex() { return BTFType.type; } - unsigned getNameOff() { return BTFType.name_off; } - virtual size_t getSize() { return sizeof(struct btf_type); } - virtual void print(raw_ostream &s, MCBTFContext &BTFContext); - virtual void emitData(MCObjectStreamer *MCOS); -}; - -// BTF_KIND_INT -class BTFTypeEntryInt : public BTFTypeEntry { - unsigned IntVal; // encoding, offset, bits - -public: - BTFTypeEntryInt(size_t id, struct btf_type &type, unsigned intval) - : BTFTypeEntry(id, type), IntVal(intval) {} - size_t getSize() { return BTFTypeEntry::getSize() + sizeof(unsigned); } - void print(raw_ostream &s, MCBTFContext &BTFContext); - void emitData(MCObjectStreamer *MCOS); -}; - -// BTF_KIND_ENUM -class BTFTypeEntryEnum : public BTFTypeEntry { - std::vector EnumValues; - -public: - BTFTypeEntryEnum(size_t id, struct btf_type &type, - std::vector &values) - : BTFTypeEntry(id, type), EnumValues(values) {} - size_t getSize() { - return BTFTypeEntry::getSize() + - BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_enum); - } - void print(raw_ostream &s, MCBTFContext &BTFContext); - void emitData(MCObjectStreamer *MCOS); -}; - -// BTF_KIND_ARRAY -class BTFTypeEntryArray : public BTFTypeEntry { - struct btf_array ArrayInfo; - -public: - BTFTypeEntryArray(size_t id, struct btf_type &type, - struct btf_array &arrayinfo) - : BTFTypeEntry(id, type), 
ArrayInfo(arrayinfo) {} - size_t getSize() { - return BTFTypeEntry::getSize() + sizeof(struct btf_array); - } - void print(raw_ostream &s, MCBTFContext &BTFContext); - void emitData(MCObjectStreamer *MCOS); -}; - -// BTF_KIND_STRUCT and BTF_KIND_UNION -class BTFTypeEntryStruct : public BTFTypeEntry { - std::vector Members; - -public: - BTFTypeEntryStruct(size_t id, struct btf_type &type, - std::vector &members) - : BTFTypeEntry(id, type), Members(members) {} - size_t getSize() { - return BTFTypeEntry::getSize() + - BTF_INFO_VLEN(BTFType.info) * sizeof(struct btf_member); - } - void print(raw_ostream &s, MCBTFContext &BTFContext); - void emitData(MCObjectStreamer *MCOS); -}; - -// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO -class BTFTypeEntryFunc : public BTFTypeEntry { - std::vector Parameters; - -public: - BTFTypeEntryFunc(size_t id, struct btf_type &type, - std::vector ¶ms) - : BTFTypeEntry(id, type), Parameters(params) {} - size_t getSize() { - return BTFTypeEntry::getSize() + - BTF_INFO_VLEN(BTFType.info) * sizeof(unsigned); - } - void print(raw_ostream &s, MCBTFContext &BTFContext); - void emitData(MCObjectStreamer *MCOS); -}; - -class BTFStringTable { - size_t Size; // total size in bytes - std::map OffsetToIdMap; - std::vector Table; - -public: - BTFStringTable() : Size(0) {} - size_t getSize() { return Size; } - std::vector &getTable() { return Table; } - size_t addString(std::string S) { - // check whether the string already exists - for (auto &OffsetM : OffsetToIdMap) { - if (Table[OffsetM.second] == S) - return OffsetM.first; - } - // not find, add to the string table - size_t Offset = Size; - OffsetToIdMap[Offset] = Table.size(); - Table.push_back(S); - Size += S.size() + 1; - return Offset; - } - std::string &getStringAtOffset(size_t Offset) { - return Table[OffsetToIdMap[Offset]]; - } - void showTable(raw_ostream &OS) { - for (auto OffsetM : OffsetToIdMap) - OS << OffsetM.first << " : " << Table[OffsetM.second] << "\n"; - } -}; - -struct BTFFuncInfo { - 
const MCSymbol *Label; - unsigned int TypeId; -}; - -struct BTFLineInfo { - MCSymbol *Label; - unsigned int FileNameOff; - unsigned int LineOff; - unsigned int LineNum; - unsigned int ColumnNum; -}; - -class MCBTFContext { - std::vector> TypeEntries; - BTFStringTable StringTable; - std::map> FuncInfoTable; - std::map> LineInfoTable; - - friend class BTFTypeEntry; - friend class BTFTypeEntryInt; - friend class BTFTypeEntryEnum; - friend class BTFTypeEntryArray; - friend class BTFTypeEntryStruct; - friend class BTFTypeEntryFunc; - -public: - void dump(raw_ostream &OS); - void emitAll(MCObjectStreamer *MCOS); - void emitCommonHeader(MCObjectStreamer *MCOS); - void emitBTFSection(MCObjectStreamer *MCOS); - void emitBTFExtSection(MCObjectStreamer *MCOS); - - size_t addString(std::string S) { return StringTable.addString(S); } - void addTypeEntry(std::unique_ptr Entry); - void addFuncInfo(unsigned SecNameOff, BTFFuncInfo Info) { - FuncInfoTable[SecNameOff].push_back(Info); - } - void addLineInfo(unsigned SecNameOff, BTFLineInfo Info) { - LineInfoTable[SecNameOff].push_back(Info); - } -}; - -} -#endif diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h index d5c49408c68..3b8ac8b79e2 100644 --- a/include/llvm/MC/MCContext.h +++ b/include/llvm/MC/MCContext.h @@ -56,7 +56,6 @@ namespace llvm { class MCSymbolWasm; class SMLoc; class SourceMgr; - class MCBTFContext; /// Context object for machine code objects. This class owns all of the /// sections that it creates. @@ -279,9 +278,6 @@ namespace llvm { /// Map of currently defined macros. 
StringMap MacroMap; - /// for BTF debug information - std::unique_ptr BTFCtx; - public: explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI, const MCObjectFileInfo *MOFI, @@ -290,9 +286,6 @@ namespace llvm { MCContext &operator=(const MCContext &) = delete; ~MCContext(); - void setBTFContext(std::unique_ptr Ctx); - std::unique_ptr &getBTFContext() { return BTFCtx; } - const SourceMgr *getSourceManager() const { return SrcMgr; } void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; } diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h index 1dda7b0712f..8cf9e1cc55a 100644 --- a/include/llvm/MC/MCObjectFileInfo.h +++ b/include/llvm/MC/MCObjectFileInfo.h @@ -207,10 +207,6 @@ protected: MCSection *SXDataSection; MCSection *GFIDsSection; - // BTF specific sections. - MCSection *BTFSection; - MCSection *BTFExtSection; - public: void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx, bool LargeCodeModel = false); @@ -376,10 +372,6 @@ public: return EHFrameSection; } - // BTF specific sections. 
- MCSection *getBTFSection() const { return BTFSection; } - MCSection *getBTFExtSection() const { return BTFExtSection; } - enum Environment { IsMachO, IsELF, IsCOFF, IsWasm }; Environment getObjectFileType() const { return Env; } diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index 9d15086ac63..c9e577b7e29 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -138,7 +138,6 @@ public: unsigned PointerSize); void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, const MCSymbol *Label); - void EmitBTFAdvanceLineAddr(const MCSymbol *Label, unsigned Size); void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, unsigned Column, bool PrologueEnd, bool IsStmt, StringRef FileName, SMLoc Loc) override; diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index 14c895a9c82..6cba4a0d4b8 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -17,7 +17,6 @@ add_llvm_library(LLVMAsmPrinter DwarfFile.cpp DwarfStringPool.cpp DwarfUnit.cpp - Dwarf2BTF.cpp EHStreamer.cpp ErlangGCPrinter.cpp OcamlGCPrinter.cpp diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp b/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp deleted file mode 100644 index b3e6fce97b6..00000000000 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.cpp +++ /dev/null @@ -1,501 +0,0 @@ -//===- Dwarf2BTF.cpp ------------------------------------------ *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "Dwarf2BTF.h" -#include "DwarfUnit.h" -#include "llvm/MC/MCBTFContext.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSectionELF.h" - -namespace llvm { - -Die2BTFEntry::~Die2BTFEntry() {} - -unsigned char Die2BTFEntry::getDieKind(const DIE &Die) { - auto Tag = Die.getTag(); - - switch (Tag) { - case dwarf::DW_TAG_base_type: - if (getBaseTypeEncoding(Die) == BTF_INVALID_ENCODING) - return BTF_KIND_UNKN; - return BTF_KIND_INT; - case dwarf::DW_TAG_const_type: - return BTF_KIND_CONST; - case dwarf::DW_TAG_pointer_type: - return BTF_KIND_PTR; - case dwarf::DW_TAG_restrict_type: - return BTF_KIND_RESTRICT; - case dwarf::DW_TAG_volatile_type: - return BTF_KIND_VOLATILE; - case dwarf::DW_TAG_typedef: - return BTF_KIND_TYPEDEF; - case dwarf::DW_TAG_structure_type: - case dwarf::DW_TAG_class_type: - if (Die.findAttribute(dwarf::DW_AT_declaration).getType() != - DIEValue::isNone) - return BTF_KIND_FWD; - else - return BTF_KIND_STRUCT; - case dwarf::DW_TAG_union_type: - if (Die.findAttribute(dwarf::DW_AT_declaration).getType() != - DIEValue::isNone) - return BTF_KIND_FWD; - else - return BTF_KIND_UNION; - case dwarf::DW_TAG_enumeration_type: - return BTF_KIND_ENUM; - case dwarf::DW_TAG_array_type: - return BTF_KIND_ARRAY; - case dwarf::DW_TAG_subprogram: - return BTF_KIND_FUNC; - case dwarf::DW_TAG_subroutine_type: - return BTF_KIND_FUNC_PROTO; - default: - break; - } - - return BTF_KIND_UNKN; -} - -std::unique_ptr Die2BTFEntry::dieToBTFTypeEntry(const DIE &Die) { - unsigned char Kind = getDieKind(Die); - - switch (Kind) { - case BTF_KIND_INT: - return llvm::make_unique(Die); - case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: - case BTF_KIND_VOLATILE: - case BTF_KIND_CONST: - case BTF_KIND_RESTRICT: - case BTF_KIND_FWD: - return llvm::make_unique(Die); - case BTF_KIND_ARRAY: - return llvm::make_unique(Die); - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - return 
llvm::make_unique(Die); - case BTF_KIND_ENUM: - return llvm::make_unique(Die); - case BTF_KIND_FUNC: - case BTF_KIND_FUNC_PROTO: - return llvm::make_unique(Die); - default: - break; - } - return nullptr; -} - -bool Die2BTFEntry::shouldSkipDie(const DIE &Die) { - auto Tag = Die.getTag(); - - switch (Tag) { - case dwarf::DW_TAG_const_type: - case dwarf::DW_TAG_pointer_type: - case dwarf::DW_TAG_restrict_type: - case dwarf::DW_TAG_typedef: - case dwarf::DW_TAG_volatile_type: { - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - if (TypeV.getType() == DIEValue::isNone) - return false; - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - return Die2BTFEntry::shouldSkipDie(TypeDie); - } - default: - return getDieKind(Die) == BTF_KIND_UNKN; - } - return true; -} -unsigned char Die2BTFEntry::getBaseTypeEncoding(const DIE &Die) { - auto V = Die.findAttribute(dwarf::DW_AT_encoding); - - if (V.getType() != DIEValue::isInteger) - return BTF_INVALID_ENCODING; - - switch (V.getDIEInteger().getValue()) { - case dwarf::DW_ATE_boolean: - return BTF_INT_BOOL; - case dwarf::DW_ATE_signed: - return BTF_INT_SIGNED; - case dwarf::DW_ATE_signed_char: - return BTF_INT_CHAR; - case dwarf::DW_ATE_unsigned: - return 0; - case dwarf::DW_ATE_unsigned_char: - return BTF_INT_CHAR; - case dwarf::DW_ATE_imaginary_float: - case dwarf::DW_ATE_packed_decimal: - case dwarf::DW_ATE_numeric_string: - case dwarf::DW_ATE_edited: - case dwarf::DW_ATE_signed_fixed: - case dwarf::DW_ATE_address: - case dwarf::DW_ATE_complex_float: - case dwarf::DW_ATE_float: - default: - break; - } - return BTF_INVALID_ENCODING; -} - -Die2BTFEntry::Die2BTFEntry(const DIE &Die) : Die(Die) { - unsigned char Kind = getDieKind(Die); - - switch (Kind) { - case BTF_KIND_CONST: - case BTF_KIND_FWD: - case BTF_KIND_PTR: - case BTF_KIND_RESTRICT: - case BTF_KIND_TYPEDEF: - case BTF_KIND_VOLATILE: - break; - default: - llvm_unreachable("Invalid Die passed into BTFTypeEntry()"); - break; - } - - BTFType.info = (Kind & 0xf) << 24; -} 
- -void Die2BTFEntry::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - if (TypeV.getType() == DIEValue::isNone) { - BTFType.type = 0; - } else { - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - auto Type = Dwarf2BTF.getTypeIndex(TypeDie); - BTFType.type = Type; - } - - unsigned char Kind = getDieKind(Die); - if (Kind != BTF_KIND_FWD) { - BTFType.name_off = 0; - } else { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - auto Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - } - - auto typeEntry = llvm::make_unique(Id, BTFType); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); -} - -Die2BTFEntryInt::Die2BTFEntryInt(const DIE &Die) : Die2BTFEntry(Die) { - unsigned char Kind = getDieKind(Die); - - switch (Kind) { - case BTF_KIND_INT: - break; - default: - assert("Invalid Die passed into BTFTypeEntryInt()"); - break; - } - - // handle BTF_INT_ENCODING in IntVal - auto Encoding = Die2BTFEntry::getBaseTypeEncoding(Die); - assert((Encoding != BTF_INVALID_ENCODING) && - "Invalid Die passed to BTFTypeEntryInt()"); - uint32_t IntVal = (Encoding & 0xf) << 24; - - // handle BTF_INT_OFFSET in IntVal - auto V = Die.findAttribute(dwarf::DW_AT_bit_offset); - if (V.getType() == DIEValue::isInteger) - IntVal |= (V.getDIEInteger().getValue() & 0xff) << 16; - - // get btf_type.size - V = Die.findAttribute(dwarf::DW_AT_byte_size); - uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff; - - // handle BTF_INT_BITS in IntVal - V = Die.findAttribute(dwarf::DW_AT_bit_size); - if (V.getType() == DIEValue::isInteger) - IntVal |= V.getDIEInteger().getValue() & 0xff; - else - IntVal |= (Size << 3) & 0xff; - - BTFType.info = BTF_KIND_INT << 24; - BTFType.size = Size; - this->IntVal = IntVal; -} - -void Die2BTFEntryInt::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - auto Str = 
NameV.getDIEString().getString(); - - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - - auto typeEntry = llvm::make_unique(Id, BTFType, IntVal); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); -} - -Die2BTFEntryEnum::Die2BTFEntryEnum(const DIE &Die) : Die2BTFEntry(Die) { - // get btf_type.size - auto V = Die.findAttribute(dwarf::DW_AT_byte_size); - uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff; - - int Vlen = 0; - for (auto &ChildDie : Die.children()) - if (ChildDie.getTag() == dwarf::DW_TAG_enumerator) - Vlen++; - - BTFType.info = (BTF_KIND_ENUM << 24) | (Vlen & BTF_MAX_VLEN); - BTFType.type = Size; -} - -void Die2BTFEntryEnum::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto TypeV = Die.findAttribute(dwarf::DW_AT_type); - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - - if (NameV.getType() != DIEValue::isNone) { - auto Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - } else - BTFType.name_off = 0; - - for (auto &ChildDie : Die.children()) { - struct btf_enum BTFEnum; - auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name); - auto Str = ChildNameV.getDIEString().getString(); - - BTFEnum.name_off = Dwarf2BTF.addBTFString(Str); - auto ChildValueV = ChildDie.findAttribute(dwarf::DW_AT_const_value); - BTFEnum.val = (int32_t)(ChildValueV.getDIEInteger().getValue()); - - EnumValues.push_back(BTFEnum); - } - - auto typeEntry = llvm::make_unique(Id, BTFType, EnumValues); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); -} - -Die2BTFEntryArray::Die2BTFEntryArray(const DIE &Die) : Die2BTFEntry(Die) { - BTFType.info = (BTF_KIND_ARRAY << 24); - BTFType.size = 0; -} - -void Die2BTFEntryArray::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - - std::string Str; - if (NameV.getType() != DIEValue::isNone) - Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - - auto &ArrayTypeDie = - 
Die.findAttribute(dwarf::DW_AT_type).getDIEEntry().getEntry(); - ArrayInfo.type = Dwarf2BTF.getTypeIndex(ArrayTypeDie); - - // The number of elements should count all subranges - unsigned Nelems = 1; - bool IsFirstSubrange = true; - for (auto &ChildDie : Die.children()) { - if (ChildDie.getTag() == dwarf::DW_TAG_subrange_type) { - if (IsFirstSubrange) { - auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type); - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - ArrayInfo.index_type = Dwarf2BTF.getTypeIndex(TypeDie); - IsFirstSubrange = false; - } - auto CountV = ChildDie.findAttribute(dwarf::DW_AT_count); - if (CountV.getType() == DIEValue::isNone) { - // array like a[] which essentially a pointer - Nelems = 0; - break; - } - Nelems *= (uint32_t)(CountV.getDIEInteger().getValue()); - } - } - ArrayInfo.nelems = Nelems; - - auto TypeEntry = llvm::make_unique(Id, BTFType, ArrayInfo); - Dwarf2BTF.addBTFTypeEntry(std::move(TypeEntry)); -} - -Die2BTFEntryStruct::Die2BTFEntryStruct(const DIE &Die) : Die2BTFEntry(Die) { - // get btf_type.size - auto V = Die.findAttribute(dwarf::DW_AT_byte_size); - uint32_t Size = V.getDIEInteger().getValue() & 0xffffffff; - auto Kind = Die2BTFEntry::getDieKind(Die); - - int Vlen = 0; - for (auto &ChildDie : Die.children()) - if (ChildDie.getTag() == dwarf::DW_TAG_member) - Vlen++; - - BTFType.size = Size; - BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN); -} - -void Die2BTFEntryStruct::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - - if (NameV.getType() != DIEValue::isNone) { - auto Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - } else - BTFType.name_off = 0; - - for (auto &ChildDie : Die.children()) { - if (ChildDie.getTag() != dwarf::DW_TAG_member) - continue; - - struct btf_member BTFMember; - auto ChildNameV = ChildDie.findAttribute(dwarf::DW_AT_name); - - if (ChildNameV.getType() != DIEValue::isNone) { - auto Str = 
ChildNameV.getDIEString().getString(); - BTFMember.name_off = Dwarf2BTF.addBTFString(Str); - } else - BTFMember.name_off = 0; - - auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_type); - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - BTFMember.type = Dwarf2BTF.getTypeIndex(TypeDie); - - auto MemLocV = ChildDie.findAttribute(dwarf::DW_AT_data_member_location); - unsigned MemLoc = MemLocV.getDIEInteger().getValue() * 8; - - auto ByteSizeV = ChildDie.findAttribute(dwarf::DW_AT_byte_size); - if (ByteSizeV.getType() != DIEValue::isNone) { - unsigned ByteSize = ByteSizeV.getDIEInteger().getValue(); - auto BitOffsetV = ChildDie.findAttribute(dwarf::DW_AT_bit_offset); - unsigned BitOffset = BitOffsetV.getDIEInteger().getValue(); - auto BitSizeV = ChildDie.findAttribute(dwarf::DW_AT_bit_size); - unsigned BitSize = BitSizeV.getDIEInteger().getValue(); - if (Dwarf2BTF.isLittleEndian()) - MemLoc += ByteSize * 8 - BitSize - BitOffset; - else - MemLoc += BitOffset; - } - BTFMember.offset = MemLoc; - - Members.push_back(BTFMember); - } - - auto typeEntry = llvm::make_unique(Id, BTFType, Members); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); -} - -Die2BTFEntryFunc::Die2BTFEntryFunc(const DIE &Die) : Die2BTFEntry(Die) { - auto Kind = Die2BTFEntry::getDieKind(Die); - - int Vlen = 0; - for (auto &ChildDie : Die.children()) - if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter) - Vlen++; - - BTFType.size = 0; - BTFType.info = (Kind << 24) | (Vlen & BTF_MAX_VLEN); -} - -void Die2BTFEntryFunc::completeData(class Dwarf2BTF &Dwarf2BTF) { - auto NameV = Die.findAttribute(dwarf::DW_AT_name); - if (NameV.getType() == DIEValue::isNone) { - BTFType.name_off = 0; - } else { - auto Str = NameV.getDIEString().getString(); - BTFType.name_off = Dwarf2BTF.addBTFString(Str); - } - - auto RetTypeV = Die.findAttribute(dwarf::DW_AT_type); - if (RetTypeV.getType() != DIEValue::isNone) { - auto &TypeDie = RetTypeV.getDIEEntry().getEntry(); - BTFType.type = Dwarf2BTF.getTypeIndex(TypeDie); - 
} else { - BTFType.type = 0; - } - - for (auto &ChildDie : Die.children()) { - if (ChildDie.getTag() == dwarf::DW_TAG_formal_parameter) { - auto TypeV = ChildDie.findAttribute(dwarf::DW_AT_abstract_origin); - if (TypeV.getType() != DIEValue::isNone) { - auto &AbsOriginDie = TypeV.getDIEEntry().getEntry(); - assert(AbsOriginDie.getTag() == dwarf::DW_TAG_formal_parameter); - TypeV = AbsOriginDie.findAttribute(dwarf::DW_AT_type); - } else { - TypeV = ChildDie.findAttribute(dwarf::DW_AT_type); - } - auto &TypeDie = TypeV.getDIEEntry().getEntry(); - Parameters.push_back(Dwarf2BTF.getTypeIndex(TypeDie)); - } else if (ChildDie.getTag() == dwarf::DW_TAG_unspecified_parameters) { - Parameters.push_back(0); - } - } - - auto typeEntry = llvm::make_unique(Id, BTFType, Parameters); - Dwarf2BTF.addBTFTypeEntry(std::move(typeEntry)); - - if (BTF_INFO_KIND(BTFType.info) == BTF_KIND_FUNC) { - auto LowPCV = Die.findAttribute(dwarf::DW_AT_low_pc); - if (LowPCV.getType() != DIEValue::isNone) { - const MCSymbol *Label = LowPCV.getDIELabel().getValue(); - BTFFuncInfo FuncInfo; - unsigned SecNameOff; - - FuncInfo.Label = Label; - FuncInfo.TypeId = Id; - if (Label->isInSection()) { - MCSection &Section = Label->getSection(); - MCSectionELF *SectionELF = dyn_cast(&Section); - assert(SectionELF); - SecNameOff = Dwarf2BTF.addBTFString(SectionELF->getSectionName().str()); - } else { - SecNameOff = Dwarf2BTF.addBTFString(".text"); - } - Dwarf2BTF.addBTFFuncInfo(SecNameOff, FuncInfo); - } - } -} - -Dwarf2BTF::Dwarf2BTF(MCContext &Context, bool IsLittleEndian) - : OuterCtx(Context), IsLE(IsLittleEndian) { - BTFContext = llvm::make_unique(); -} - -void Dwarf2BTF::addTypeEntry(const DIE &Die) { - for (auto &ChildDie : Die.children()) - addTypeEntry(ChildDie); - if (Die2BTFEntry::shouldSkipDie(Die)) - return; - auto Kind = Die2BTFEntry::getDieKind(Die); - if (Kind != BTF_KIND_UNKN) { - auto TypeEntry = Die2BTFEntry::dieToBTFTypeEntry(Die); - if (TypeEntry != nullptr) { - 
TypeEntry->setId(TypeEntries.size() + 1); - DieToIdMap[const_cast(&Die)] = TypeEntry->getId(); - TypeEntries.push_back(std::move(TypeEntry)); - } - } -} - -void Dwarf2BTF::addBTFTypeEntry(std::unique_ptr Entry) { - BTFContext->addTypeEntry(std::move(Entry)); -} - -void Dwarf2BTF::completeData() { - BTFContext->addString("\0"); - - for (auto &TypeEntry : TypeEntries) - TypeEntry->completeData(*this); -} - -void Dwarf2BTF::addDwarfCU(DwarfUnit *TheU) { - DIE &CuDie = TheU->getUnitDie(); - - assert((CuDie.getTag() == dwarf::DW_TAG_compile_unit) && - "Not a compile unit"); - addTypeEntry(CuDie); -} - -void Dwarf2BTF::finish() { - completeData(); - OuterCtx.setBTFContext(std::move(BTFContext)); -} - -} // namespace llvm diff --git a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h b/lib/CodeGen/AsmPrinter/Dwarf2BTF.h deleted file mode 100644 index ae13847214c..00000000000 --- a/lib/CodeGen/AsmPrinter/Dwarf2BTF.h +++ /dev/null @@ -1,133 +0,0 @@ -//===- Dwarf2BTF.h -------------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H -#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARF2BTF_H - -#include "DwarfUnit.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/DIE.h" -#include "llvm/MC/MCBTFContext.h" -#include - -namespace llvm { - -class Dwarf2BTF; -class MCBTFContext; - -#define BTF_INVALID_ENCODING 0xff - -class Die2BTFEntry { -protected: - const DIE &Die; - size_t Id; /* type index in the BTF list, started from 1 */ - struct btf_type BTFType; - -public: - virtual ~Die2BTFEntry(); - // Return desired BTF_KIND for the Die, return BTF_KIND_UNKN for - // invalid/unsupported Die - static unsigned char getDieKind(const DIE &Die); - - // Return proper BTF_INT_ENCODING of a basetype. - // Return BTF_INVALID_ENCODING for unsupported (float, etc.) - static unsigned char getBaseTypeEncoding(const DIE &Die); - - // Return whether this Die should be skipped. - // We currently skip unsupported data type (e.g. 
float) - // and references to unsupported types - static bool shouldSkipDie(const DIE &Die); - - static std::unique_ptr dieToBTFTypeEntry(const DIE &Die); - - Die2BTFEntry(const DIE &Die); - void setId(size_t Id) { this->Id = Id; } - size_t getId() { return Id; } - virtual void completeData(class Dwarf2BTF &Dwarf2BTF); -}; - -// BTF_KIND_INT -class Die2BTFEntryInt : public Die2BTFEntry { - uint32_t IntVal; // encoding, offset, bits - -public: - Die2BTFEntryInt(const DIE &Die); - void completeData(class Dwarf2BTF &Dwarf2BTF); -}; - -// BTF_KIND_ENUM -class Die2BTFEntryEnum : public Die2BTFEntry { - std::vector EnumValues; - -public: - Die2BTFEntryEnum(const DIE &Die); - void completeData(class Dwarf2BTF &Dwarf2BTF); -}; - -// BTF_KIND_ARRAY -class Die2BTFEntryArray : public Die2BTFEntry { - struct btf_array ArrayInfo; - -public: - Die2BTFEntryArray(const DIE &Die); - void completeData(class Dwarf2BTF &Dwarf2BTF); -}; - -// BTF_KIND_STRUCT and BTF_KIND_UNION -class Die2BTFEntryStruct : public Die2BTFEntry { - std::vector Members; - -public: - Die2BTFEntryStruct(const DIE &Die); - void completeData(class Dwarf2BTF &Dwarf2BTF); -}; - -// BTF_KIND_FUNC and BTF_KIND_FUNC_PROTO -class Die2BTFEntryFunc : public Die2BTFEntry { - std::vector Parameters; - -public: - Die2BTFEntryFunc(const DIE &Die); - void completeData(class Dwarf2BTF &Dwarf2BTF); -}; - -class Dwarf2BTF { - std::vector> TypeEntries; - std::map DieToIdMap; - std::unique_ptr BTFContext; - MCContext &OuterCtx; - bool IsLE; - -public: - Dwarf2BTF(MCContext &Context, bool IsLittleEndian); - bool isLittleEndian() { return IsLE; } - void addDwarfCU(DwarfUnit *TheU); - void finish(); - uint32_t getTypeIndex(DIE &Die) { - DIE *DiePtr = const_cast(&Die); - assert((DieToIdMap.find(DiePtr) != DieToIdMap.end()) && - "Die not added to in the BTFContext"); - return DieToIdMap[DiePtr]; - } - size_t addBTFString(std::string S) { return BTFContext->addString(S); } - void addBTFTypeEntry(std::unique_ptr Entry); - void 
addBTFFuncInfo(unsigned SecNameOff, BTFFuncInfo FuncInfo) { - BTFContext->addFuncInfo(SecNameOff, FuncInfo); - } - -private: - void addTypeEntry(const DIE &Die); - bool alreadyAdded(DIE &Die) { - return DieToIdMap.find(const_cast(&Die)) != DieToIdMap.end(); - } - void completeData(); -}; - -} // namespace llvm -#endif diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 184ec4dabe9..94e12658cfe 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -971,10 +971,6 @@ void DwarfDebug::endModule() { // Emit the pubnames and pubtypes sections if requested. emitDebugPubSections(); - const Triple &TT = Asm->TM.getTargetTriple(); - if (TT.getArch() == Triple::bpfel || TT.getArch() == Triple::bpfeb) - emitBTFSection(TT.getArch() == Triple::bpfel); - // clean up. // FIXME: AbstractVariables.clear(); } @@ -2459,12 +2455,6 @@ MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) { return &SplitTypeUnitFileTable; } -void DwarfDebug::emitBTFSection(bool IsLittleEndian) { - DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; - - Holder.emitBTFSection(IsLittleEndian); -} - uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) { MD5 Hash; Hash.update(Identifier); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 1350317db02..fecf8056765 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -567,9 +567,6 @@ class DwarfDebug : public DebugHandlerBase { /// Emit the reference to the section. void emitSectionReference(const DwarfCompileUnit &CU); - // Emit the BTF sections - void emitBTFSection(bool IsLittleEndian); - protected: /// Gather pre-function debug information. 
void beginFunctionImpl(const MachineFunction *MF) override; diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp index 7ac16b34c4c..0ab9ea87c23 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#include "Dwarf2BTF.h" #include "DwarfFile.h" #include "DwarfCompileUnit.h" #include "DwarfDebug.h" @@ -16,8 +15,6 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/DIE.h" #include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/MC/MCBTFContext.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCStreamer.h" #include #include @@ -91,13 +88,6 @@ void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection, StrPool.emit(*Asm, StrSection, OffsetSection, UseRelativeOffsets); } -void DwarfFile::emitBTFSection(bool IsLittleEndian) { - Dwarf2BTF Dwarf2BTF(Asm->OutContext, IsLittleEndian); - for (auto &TheU : CUs) - Dwarf2BTF.addDwarfCU(TheU.get()); - Dwarf2BTF.finish(); -} - bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) { auto &ScopeVars = ScopeVariables[LS]; const DILocalVariable *DV = Var->getVariable(); diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index 114f98f725d..c315f44a8d8 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -114,9 +114,6 @@ public: void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr, bool UseRelativeOffsets = false); - // Emit all data for the BTF section - void emitBTFSection(bool IsLittleEndian); - /// Returns the string pool. 
DwarfStringPool &getStringPool() { return StrPool; } @@ -137,7 +134,9 @@ public: return ScopeVariables; } - DenseMap &getScopeLabels() { return ScopeLabels; } + DenseMap &getScopeLabels() { + return ScopeLabels; + } DenseMap &getAbstractSPDies() { return AbstractSPDies; diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt index 85bf1616fd6..ba36d99e8f7 100644 --- a/lib/MC/CMakeLists.txt +++ b/lib/MC/CMakeLists.txt @@ -10,13 +10,11 @@ add_llvm_library(LLVMMC MCAsmMacro.cpp MCAsmStreamer.cpp MCAssembler.cpp - MCBTFContext.cpp MCCodeEmitter.cpp MCCodePadder.cpp MCCodeView.cpp MCContext.cpp MCDwarf.cpp - MCDwarf2BTF.cpp MCELFObjectTargetWriter.cpp MCELFStreamer.cpp MCExpr.cpp diff --git a/lib/MC/MCBTFContext.cpp b/lib/MC/MCBTFContext.cpp deleted file mode 100644 index cb121c41552..00000000000 --- a/lib/MC/MCBTFContext.cpp +++ /dev/null @@ -1,229 +0,0 @@ -//===- lib/MC/MCBTFContext.cpp - Machine Code BTF Context -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCBTFContext.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCObjectStreamer.h" -#include -#include -#include - -using namespace llvm; - -#define DEBUG_TYPE "btf" - -BTFTypeEntry::~BTFTypeEntry() {} - -void MCBTFContext::addTypeEntry(std::unique_ptr Entry) { - TypeEntries.push_back(std::move(Entry)); -} - -void MCBTFContext::dump(raw_ostream &OS) { - OS << "Type Table:\n"; - for (size_t i = 0; i < TypeEntries.size(); i++) { - auto TypeEntry = TypeEntries[i].get(); - TypeEntry->print(OS, *this); - } - - OS << "\nString Table:\n"; - StringTable.showTable(OS); - - OS << "\nFuncInfo Table:\n"; - for (auto &FuncSec : FuncInfoTable) { - OS << "sec_name_off=" << FuncSec.first << "\n"; - for (auto &FuncInfo : FuncSec.second) { - OS << "\tinsn_offset= type_id=" << FuncInfo.TypeId << "\n"; - } - } - - OS << "\nLineInfo Table:\n"; - for (auto &LineSec : LineInfoTable) { - OS << "sec_name_off=" << LineSec.first << "\n"; - for (auto &LineInfo : LineSec.second) { - OS << "\tinsn_offset= file_name_off=" << LineInfo.FileNameOff - << " line_off=" << LineInfo.LineOff << " line_num=" << LineInfo.LineNum - << " column_num=" << LineInfo.ColumnNum << "\n"; - } - } -} - -void MCBTFContext::emitCommonHeader(MCObjectStreamer *MCOS) { - MCOS->EmitIntValue(BTF_MAGIC, 2); - MCOS->EmitIntValue(BTF_VERSION, 1); - MCOS->EmitIntValue(0, 1); -} - -void MCBTFContext::emitBTFSection(MCObjectStreamer *MCOS) { - MCContext &context = MCOS->getContext(); - MCOS->SwitchSection(context.getObjectFileInfo()->getBTFSection()); - - // emit header - emitCommonHeader(MCOS); - MCOS->EmitIntValue(sizeof(struct btf_header), 4); - - uint32_t type_len = 0, str_len; - for (auto &TypeEntry : TypeEntries) - type_len += TypeEntry->getSize(); - str_len = StringTable.getSize(); - - MCOS->EmitIntValue(0, 4); - MCOS->EmitIntValue(type_len, 4); - MCOS->EmitIntValue(type_len, 
4); - MCOS->EmitIntValue(str_len, 4); - - // emit type table - for (auto &TypeEntry : TypeEntries) - TypeEntry->emitData(MCOS); - - // emit string table - for (auto &S : StringTable.getTable()) { - for (auto C : S) - MCOS->EmitIntValue(C, 1); - MCOS->EmitIntValue('\0', 1); - } -} - -void MCBTFContext::emitBTFExtSection(MCObjectStreamer *MCOS) { - MCContext &context = MCOS->getContext(); - MCOS->SwitchSection(context.getObjectFileInfo()->getBTFExtSection()); - - // emit header - emitCommonHeader(MCOS); - MCOS->EmitIntValue(sizeof(struct btf_ext_header), 4); - - uint32_t func_len = 0, line_len = 0; - for (auto &FuncSec : FuncInfoTable) { - func_len += sizeof(struct btf_sec_func_info); - func_len += FuncSec.second.size() * sizeof(struct bpf_func_info); - } - for (auto &LineSec : LineInfoTable) { - line_len += sizeof(struct btf_sec_line_info); - line_len += LineSec.second.size() * sizeof(struct bpf_line_info); - } - - MCOS->EmitIntValue(0, 4); - MCOS->EmitIntValue(func_len, 4); - MCOS->EmitIntValue(func_len, 4); - MCOS->EmitIntValue(line_len, 4); - - // emit func_info table - for (const auto &FuncSec : FuncInfoTable) { - MCOS->EmitIntValue(FuncSec.first, 4); - MCOS->EmitIntValue(FuncSec.second.size(), 4); - for (const auto &FuncInfo : FuncSec.second) { - MCOS->EmitBTFAdvanceLineAddr(FuncInfo.Label, 4); - MCOS->EmitIntValue(FuncInfo.TypeId, 4); - } - } - - // emit line_info table - for (const auto &LineSec : LineInfoTable) { - MCOS->EmitIntValue(LineSec.first, 4); - MCOS->EmitIntValue(LineSec.second.size(), 4); - for (const auto &LineInfo : LineSec.second) { - MCOS->EmitBTFAdvanceLineAddr(LineInfo.Label, 4); - MCOS->EmitIntValue(LineInfo.FileNameOff, 4); - MCOS->EmitIntValue(LineInfo.LineOff, 4); - MCOS->EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4); - } - } -} - -void MCBTFContext::emitAll(MCObjectStreamer *MCOS) { - LLVM_DEBUG(dump(dbgs())); - emitBTFSection(MCOS); - emitBTFExtSection(MCOS); -} - -void BTFTypeEntry::print(raw_ostream &OS, MCBTFContext 
&MCBTFContext) { - OS << "[" << Id << "] " << btf_kind_str[BTF_INFO_KIND(BTFType.info)] - << " name_off=" << BTFType.name_off - << " info=" << format("0x%08lx", BTFType.info) - << " size/type=" << BTFType.size << "\n"; -} - -void BTFTypeEntry::emitData(MCObjectStreamer *MCOS) { - MCOS->EmitIntValue(BTFType.name_off, 4); - MCOS->EmitIntValue(BTFType.info, 4); - MCOS->EmitIntValue(BTFType.size, 4); -} - -void BTFTypeEntryInt::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { - BTFTypeEntry::print(OS, MCBTFContext); - OS << "\tdesc=" << format("0x%08lx", IntVal) << "\n"; -} - -void BTFTypeEntryInt::emitData(MCObjectStreamer *MCOS) { - BTFTypeEntry::emitData(MCOS); - MCOS->EmitIntValue(IntVal, 4); -} - -void BTFTypeEntryEnum::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { - BTFTypeEntry::print(OS, MCBTFContext); - for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { - auto &EnumValue = EnumValues[i]; - OS << "\tname_off=" << EnumValue.name_off << " value=" << EnumValue.val - << "\n"; - } -} - -void BTFTypeEntryEnum::emitData(MCObjectStreamer *MCOS) { - BTFTypeEntry::emitData(MCOS); - for (auto &EnumValue : EnumValues) { - MCOS->EmitIntValue(EnumValue.name_off, 4); - MCOS->EmitIntValue(EnumValue.val, 4); - } -} - -void BTFTypeEntryArray::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { - BTFTypeEntry::print(OS, MCBTFContext); - OS << "\telem_type=" << format("0x%08lx", ArrayInfo.type) - << " index_type=" << format("0x%08lx", ArrayInfo.index_type) - << " num_element=" << ArrayInfo.nelems << "\n"; -} - -void BTFTypeEntryArray::emitData(MCObjectStreamer *MCOS) { - BTFTypeEntry::emitData(MCOS); - MCOS->EmitIntValue(ArrayInfo.type, 4); - MCOS->EmitIntValue(ArrayInfo.index_type, 4); - MCOS->EmitIntValue(ArrayInfo.nelems, 4); -} - -void BTFTypeEntryStruct::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { - BTFTypeEntry::print(OS, MCBTFContext); - for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { - auto &Member = Members[i]; - OS << "\tname_off=" 
<< Member.name_off << " type=" << Member.type - << " bit_offset=" << Member.offset << "\n"; - } -} - -void BTFTypeEntryStruct::emitData(MCObjectStreamer *MCOS) { - BTFTypeEntry::emitData(MCOS); - for (auto &Member : Members) { - MCOS->EmitIntValue(Member.name_off, 4); - MCOS->EmitIntValue(Member.type, 4); - MCOS->EmitIntValue(Member.offset, 4); - } -} - -void BTFTypeEntryFunc::print(raw_ostream &OS, MCBTFContext &MCBTFContext) { - BTFTypeEntry::print(OS, MCBTFContext); - for (size_t i = 0; i < BTF_INFO_VLEN(BTFType.info); i++) { - auto Parameter = Parameters[i]; - OS << "\tparam_type=" << Parameter << "\n"; - } -} - -void BTFTypeEntryFunc::emitData(MCObjectStreamer *MCOS) { - BTFTypeEntry::emitData(MCOS); - for (auto &Parameter : Parameters) - MCOS->EmitIntValue(Parameter, 4); -} diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 18250a474b7..fab517075c5 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -17,7 +17,6 @@ #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCCodeView.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" @@ -61,7 +60,7 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri, : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi), Symbols(Allocator), UsedNames(Allocator), CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), - AutoReset(DoAutoReset), BTFCtx(nullptr) { + AutoReset(DoAutoReset) { SecureLogFile = AsSecureLogFileName; if (SrcMgr && SrcMgr->getNumBuffers()) @@ -115,14 +114,6 @@ void MCContext::reset() { GenDwarfFileNumber = 0; HadError = false; - BTFCtx.reset(); -} - -//===----------------------------------------------------------------------===// -// BTFCtx Manipulation -//===----------------------------------------------------------------------===// -void MCContext::setBTFContext(std::unique_ptr Ctx) { - BTFCtx = std::move(Ctx); } 
//===----------------------------------------------------------------------===// diff --git a/lib/MC/MCDwarf2BTF.cpp b/lib/MC/MCDwarf2BTF.cpp deleted file mode 100644 index 9809a2153ec..00000000000 --- a/lib/MC/MCDwarf2BTF.cpp +++ /dev/null @@ -1,102 +0,0 @@ -//===- MCDwarf2BTF.cpp ---------------------------------------- *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "MCDwarf2BTF.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCBTFContext.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/EndianStream.h" -#include - -using namespace llvm; - -void MCDwarf2BTF::addFiles(MCObjectStreamer *MCOS, std::string &FileName, - std::vector &Files) { - std::vector Content; - - std::ifstream Inputfile(FileName); - std::string Line; - Content.push_back(Line); // line 0 for empty string - while (std::getline(Inputfile, Line)) - Content.push_back(Line); - - Files.push_back(FileContent(FileName, Content)); -} - -void MCDwarf2BTF::addLines( - MCObjectStreamer *MCOS, StringRef &SectionName, - std::vector &Files, - const MCLineSection::MCDwarfLineEntryCollection &LineEntries) { - MCContext &Context = MCOS->getContext(); - auto &BTFCxt = Context.getBTFContext(); - - unsigned SecNameOff = BTFCxt->addString(SectionName.str()); - for (const MCDwarfLineEntry &LineEntry : LineEntries) { - BTFLineInfo LineInfo; - unsigned FileNum = LineEntry.getFileNum(); - unsigned Line = LineEntry.getLine(); - - LineInfo.Label = LineEntry.getLabel(); - if (FileNum < Files.size()) { - LineInfo.FileNameOff = BTFCxt->addString(Files[FileNum].first); - if (Line < Files[FileNum].second.size()) - 
LineInfo.LineOff = BTFCxt->addString(Files[FileNum].second[Line]); - else - LineInfo.LineOff = 0; - } else { - LineInfo.FileNameOff = 0; - LineInfo.LineOff = 0; - } - LineInfo.LineNum = Line; - LineInfo.ColumnNum = LineEntry.getColumn(); - BTFCxt->addLineInfo(SecNameOff, LineInfo); - } -} - -void MCDwarf2BTF::addDwarfLineInfo(MCObjectStreamer *MCOS) { - MCContext &Context = MCOS->getContext(); - - auto &LineTables = Context.getMCDwarfLineTables(); - if (LineTables.empty()) - return; - - for (const auto &CUIDTablePair : LineTables) { - std::vector Dirs; - std::vector Files; - - for (auto &Dir : CUIDTablePair.second.getMCDwarfDirs()) - Dirs.push_back(Dir); - for (auto &File : CUIDTablePair.second.getMCDwarfFiles()) { - std::string FileName; - if (File.DirIndex == 0) - FileName = File.Name; - else - FileName = Dirs[File.DirIndex - 1] + "/" + File.Name; - MCDwarf2BTF::addFiles(MCOS, FileName, Files); - } - for (const auto &LineSec : - CUIDTablePair.second.getMCLineSections().getMCLineEntries()) { - MCSection *Section = LineSec.first; - const MCLineSection::MCDwarfLineEntryCollection &LineEntries = - LineSec.second; - - StringRef SectionName; - if (MCSectionELF *SectionELF = dyn_cast(Section)) - SectionName = SectionELF->getSectionName(); - else - return; - MCDwarf2BTF::addLines(MCOS, SectionName, Files, LineEntries); - } - } -} diff --git a/lib/MC/MCDwarf2BTF.h b/lib/MC/MCDwarf2BTF.h deleted file mode 100644 index 69983374a09..00000000000 --- a/lib/MC/MCDwarf2BTF.h +++ /dev/null @@ -1,30 +0,0 @@ -//===- MCDwarf2BTF.h ------------------------------------------ *- C++ --*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_MC_MCDWARF2BTF_H -#define LLVM_LIB_MC_MCDWARF2BTF_H - -#include "llvm/MC/MCDwarf.h" - -namespace llvm { - -using FileContent = std::pair>; - -class MCDwarf2BTF { -public: - static void addFiles(MCObjectStreamer *MCOS, std::string &FileName, - std::vector &Files); - static void - addLines(MCObjectStreamer *MCOS, StringRef &SectionName, - std::vector &Files, - const MCLineSection::MCDwarfLineEntryCollection &LineEntries); - static void addDwarfLineInfo(MCObjectStreamer *MCOS); -}; - -} // namespace llvm -#endif diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index bddcf459ac0..edfccfcb9ed 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -468,9 +468,6 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { Ctx->getELFSection(".eh_frame", EHSectionType, EHSectionFlags); StackSizesSection = Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, 0); - - BTFSection = Ctx->getELFSection(".BTF", ELF::SHT_PROGBITS, 0); - BTFExtSection = Ctx->getELFSection(".BTF.ext", ELF::SHT_PROGBITS, 0); } void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 4f74f4101c8..8c88db009bd 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -14,7 +14,6 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCCodeView.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCBTFContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" @@ -22,7 +21,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" -#include "MCDwarf2BTF.h" using namespace llvm; MCObjectStreamer::MCObjectStreamer(MCContext &Context, @@ -441,31 +439,6 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, insert(new 
MCDwarfCallFrameFragment(*AddrDelta)); } -void MCObjectStreamer::EmitBTFAdvanceLineAddr(const MCSymbol *Label, - unsigned Size) { - const MCExpr *Value = MCSymbolRefExpr::create(Label, getContext()); - MCDataFragment *DF = getOrCreateDataFragment(); - - // Avoid fixups when possible. - int64_t AbsValue; - SMLoc Loc; - - if (Value->evaluateAsAbsolute(AbsValue, getAssemblerPtr())) { - if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) { - getContext().reportError( - Loc, "value evaluated as " + Twine(AbsValue) + " is out of range."); - return; - } - EmitIntValue(AbsValue, Size); - return; - } - - DF->getFixups().push_back( - MCFixup::create(DF->getContents().size(), Value, - MCFixup::getKindForSize(Size, false), Loc)); - DF->getContents().resize(DF->getContents().size() + Size, 0); -} - void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line, unsigned Column, bool PrologueEnd, bool IsStmt, @@ -715,13 +688,6 @@ void MCObjectStreamer::FinishImpl() { // Dump out the dwarf file & directory tables and line tables. MCDwarfLineTable::Emit(this, getAssembler().getDWARFLinetableParams()); - auto &BTFCtx = getContext().getBTFContext(); - if (BTFCtx) { - MCDwarf2BTF::addDwarfLineInfo(this); - BTFCtx->emitAll(this); - BTFCtx.reset(); - } - flushPendingLabels(); getAssembler().Finish(); } -- GitLab From 5650e8ff9073049ea2ef27f881e301ebafb94c49 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 21:59:55 +0000 Subject: [PATCH 0130/1116] [LegalizeVectorTypes] When widening the result of a bitcast from a scalar type, use a scalar_to_vector to turn the scalar into a vector instead of a build vector full of mostly undefs. This is more consistent with what we usually do and matches some code X86 custom emits in some cases that I think I can clean up. The MIPS test change just looks to be an instruction ordering change.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344422 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeVectorTypes.cpp | 26 +++++++++---------- test/CodeGen/Mips/cconv/vector.ll | 24 ++++++++--------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 310f5ef5dc7..f4cad796863 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3022,22 +3022,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { } if (TLI.isTypeLegal(NewInVT)) { - // Because the result and the input are different vector types, widening - // the result could create a legal type but widening the input might make - // it an illegal type that might lead to repeatedly splitting the input - // and then widening it. To avoid this, we widen the input only if - // it results in a legal type. - SmallVector Ops(NewNumElts); - SDValue UndefVal = DAG.getUNDEF(InVT); - Ops[0] = InOp; - for (unsigned i = 1; i < NewNumElts; ++i) - Ops[i] = UndefVal; - SDValue NewVec; - if (InVT.isVector()) + if (InVT.isVector()) { + // Because the result and the input are different vector types, widening + // the result could create a legal type but widening the input might make + // it an illegal type that might lead to repeatedly splitting the input + // and then widening it. To avoid this, we widen the input only if + // it results in a legal type. 
+ SmallVector Ops(NewNumElts, DAG.getUNDEF(InVT)); + Ops[0] = InOp; + NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops); - else - NewVec = DAG.getBuildVector(NewInVT, dl, Ops); + } else { + NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp); + } return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec); } } diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll index 29ffe23f712..d6e260786d1 100644 --- a/test/CodeGen/Mips/cconv/vector.ll +++ b/test/CodeGen/Mips/cconv/vector.ll @@ -2420,10 +2420,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) { ; MIPS64R5EB-NEXT: lui $1, %hi(%neg(%gp_rel(float_2))) ; MIPS64R5EB-NEXT: daddu $1, $1, $25 ; MIPS64R5EB-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(float_2))) -; MIPS64R5EB-NEXT: sd $5, 0($sp) -; MIPS64R5EB-NEXT: sd $4, 16($sp) -; MIPS64R5EB-NEXT: ld.w $w0, 0($sp) -; MIPS64R5EB-NEXT: ld.w $w1, 16($sp) +; MIPS64R5EB-NEXT: sd $5, 16($sp) +; MIPS64R5EB-NEXT: sd $4, 0($sp) +; MIPS64R5EB-NEXT: ld.w $w0, 16($sp) +; MIPS64R5EB-NEXT: ld.w $w1, 0($sp) ; MIPS64R5EB-NEXT: fadd.w $w0, $w1, $w0 ; MIPS64R5EB-NEXT: shf.w $w0, $w0, 177 ; MIPS64R5EB-NEXT: copy_s.d $2, $w0[0] @@ -2463,10 +2463,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) { ; MIPS64R5EL-NEXT: lui $1, %hi(%neg(%gp_rel(float_2))) ; MIPS64R5EL-NEXT: daddu $1, $1, $25 ; MIPS64R5EL-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(float_2))) -; MIPS64R5EL-NEXT: sd $5, 0($sp) -; MIPS64R5EL-NEXT: sd $4, 16($sp) -; MIPS64R5EL-NEXT: ld.w $w0, 0($sp) -; MIPS64R5EL-NEXT: ld.w $w1, 16($sp) +; MIPS64R5EL-NEXT: sd $5, 16($sp) +; MIPS64R5EL-NEXT: sd $4, 0($sp) +; MIPS64R5EL-NEXT: ld.w $w0, 16($sp) +; MIPS64R5EL-NEXT: ld.w $w1, 0($sp) ; MIPS64R5EL-NEXT: fadd.w $w0, $w1, $w0 ; MIPS64R5EL-NEXT: copy_s.d $2, $w0[0] ; MIPS64R5EL-NEXT: ld $1, %got_disp(float_res_v2f32)($1) @@ -6211,14 +6211,14 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) { ; MIPS64R5-NEXT: andi $1, $1, 255 ; MIPS64R5-NEXT: sw $1, 36($sp) ; MIPS64R5-NEXT: sw 
$1, 32($sp) -; MIPS64R5-NEXT: sd $4, 16($sp) +; MIPS64R5-NEXT: sd $4, 0($sp) ; MIPS64R5-NEXT: ld.w $w0, 32($sp) ; MIPS64R5-NEXT: ffint_s.w $w0, $w0 -; MIPS64R5-NEXT: ld.w $w1, 16($sp) -; MIPS64R5-NEXT: fadd.w $w0, $w0, $w1 -; MIPS64R5-NEXT: sd $6, 0($sp) ; MIPS64R5-NEXT: ld.w $w1, 0($sp) ; MIPS64R5-NEXT: fadd.w $w0, $w0, $w1 +; MIPS64R5-NEXT: sd $6, 16($sp) +; MIPS64R5-NEXT: ld.w $w1, 16($sp) +; MIPS64R5-NEXT: fadd.w $w0, $w0, $w1 ; MIPS64R5-NEXT: splati.w $w1, $w0[1] ; MIPS64R5-NEXT: add.s $f0, $f0, $f1 ; MIPS64R5-NEXT: daddiu $sp, $sp, 48 -- GitLab From 18cda8141231bc1afa46d2ea805c319edf4589e6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 21:59:58 +0000 Subject: [PATCH 0131/1116] [X86] Skip (v2i32/v4i16/v8i8 (bitcast (f64))) handling in ReplaceNodeResults if the dest type can be widened by generic legalization. NFCI The algorithm we would do previously was identical to generic legalization. If we ever switch to legalizing integer vectors via widening we'll be able to kill off the code since it now only runs for promotion. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344423 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 97731dff9b2..220e2e2fdc0 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26297,7 +26297,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } if (SrcVT != MVT::f64 || - (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) + (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || + getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) return; unsigned NumElts = DstVT.getVectorNumElements(); @@ -26307,13 +26308,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); - if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) { - // If we are legalizing vectors by widening, we already have the desired - // legal vector type, just return it. - Results.push_back(ToVecInt); - return; - } - SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, -- GitLab From e9abd40f5c6ad7f17936b45a8421b4b4995488aa Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 22:00:00 +0000 Subject: [PATCH 0132/1116] [X86] Simplify the end of custom type legalization for (v2i32/v4i16/v8i8 (bitcast (f64))) by just emitting an EXTRACT_SUBVECTOR instead of a BUILD_VECTOR. Generic legalization should be able to finish legalizing the EXTRACT_SUBVECTOR probably by turning it into a BUILD_VECTOR. But we should emit the simplest sequence. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344424 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 220e2e2fdc0..ffb5acf3386 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26307,13 +26307,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); - - SmallVector Elts; - for (unsigned i = 0, e = NumElts; i != e; ++i) - Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, - ToVecInt, DAG.getIntPtrConstant(i, dl))); - - Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, + ToVecInt, DAG.getIntPtrConstant(0, dl)); + Results.push_back(Extract); return; } case ISD::MGATHER: { -- GitLab From bb098ae625465cc0b6341b22b9f1160dc9a7a110 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 22:00:04 +0000 Subject: [PATCH 0133/1116] [X86] Improve type legalization of (v2i32/v4i16/v8i16 (bitcast (v2f32))) to avoid a stack temporary.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344425 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 20 +++++++++++++------- test/CodeGen/X86/2012-01-18-vbitcast.ll | 9 +-------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ffb5acf3386..86141965393 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -26296,7 +26296,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - if (SrcVT != MVT::f64 || + if ((SrcVT != MVT::f64 && SrcVT != MVT::v2f32) || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) return; @@ -26304,12 +26304,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); - SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, - MVT::v2f64, N->getOperand(0)); - SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); - SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, - ToVecInt, DAG.getIntPtrConstant(0, dl)); - Results.push_back(Extract); + SDValue Res; + if (SrcVT == MVT::f64) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, + MVT::v2f64, N->getOperand(0)); + else + Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), + DAG.getUNDEF(MVT::v2f32)); + + Res = DAG.getBitcast(WiderVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); return; } case ISD::MGATHER: { diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll index 61c25021bba..ab57b61770d 100644 --- a/test/CodeGen/X86/2012-01-18-vbitcast.ll +++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -4,17 +4,10 @@ define <2 x i32> @vcast(<2 
x float> %a, <2 x float> %b) { ; CHECK-LABEL: vcast: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $16, %rsp -; CHECK-NEXT: .seh_stackalloc 16 -; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero ; CHECK-NEXT: psubq %xmm1, %xmm0 -; CHECK-NEXT: addq $16, %rsp ; CHECK-NEXT: retq -; CHECK-NEXT: .seh_handlerdata -; CHECK-NEXT: .text -; CHECK-NEXT: .seh_endproc %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> %x = sub <2 x i32> %af, %bf -- GitLab From cb064c84c5432b8193815b2ad04939c81f821c18 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 12 Oct 2018 22:55:17 +0000 Subject: [PATCH 0134/1116] [LegalizeVectorTypes] Use TLI.getVectorIdxTy instead of DAG.getIntPtrConstant. There's no guarantee that vector indices should use pointer types. So use the correct query method. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344428 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index f4cad796863..a08a41ccaf2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3749,8 +3749,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) { InVT.getVectorNumElements()); if (TLI.isTypeLegal(WideVT)) { SDValue Res = DAG.getNode(Opcode, dl, WideVT, InOp); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode( + ISD::EXTRACT_SUBVECTOR, dl, VT, Res, + DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); } EVT InEltVT = InVT.getVectorElementType(); -- GitLab From 8aea7592db278dee40dee36e9a4fdcf58a544223 Mon Sep 17 00:00:00 2001 From: 
Fangrui Song Date: Fri, 12 Oct 2018 22:57:57 +0000 Subject: [PATCH 0135/1116] [llvm-readobj] Fix an error message about .llvm.call-graph-profile .note.llvm.cgprofile was an obvious typo in rL333823 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344430 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-readobj/ELFDumper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index 6f71d2d8b6b..5e7eae1b272 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -1423,7 +1423,7 @@ ELFDumper::ELFDumper(const ELFFile *Obj, ScopedPrinter &Writer) break; case ELF::SHT_LLVM_CALL_GRAPH_PROFILE: if (DotCGProfileSec != nullptr) - reportError("Multiple .note.llvm.cgprofile"); + reportError("Multiple .llvm.call-graph-profile"); DotCGProfileSec = &Sec; break; case ELF::SHT_LLVM_ADDRSIG: -- GitLab From 56ebea371fc81717d6269a8c35e0e9a553195f18 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Fri, 12 Oct 2018 23:18:52 +0000 Subject: [PATCH 0136/1116] [RISCV] Eliminate unnecessary masking of promoted shift amounts SelectionDAGBuilder::visitShift will always zero-extend a shift amount when it is promoted to the ShiftAmountTy. This results in zero-extension (masking) which is unnecessary for RISC-V as the shift operations only read the lower 5 or 6 bits (RV32 or RV64). I initially proposed adding a getExtendForShiftAmount hook so the shift amount can be any-extended (D52975). @efriedma explained this was unsafe, so I have instead eliminated the unnecessary 'and' operations at instruction selection time in a manner similar to X86InstrCompiler.td.
Differential Revision: https://reviews.llvm.org/D53224 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344432 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/RISCV/RISCVInstrInfo.td | 23 +++++++- test/CodeGen/RISCV/alu16.ll | 9 --- test/CodeGen/RISCV/alu8.ll | 5 -- test/CodeGen/RISCV/shift-masked-shamt.ll | 70 ++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 test/CodeGen/RISCV/shift-masked-shamt.ll diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td index 5ca1cbd165d..50012569a74 100644 --- a/lib/Target/RISCV/RISCVInstrInfo.td +++ b/lib/Target/RISCV/RISCVInstrInfo.td @@ -205,6 +205,12 @@ def ixlenimm : Operand { // Standalone (codegen-only) immleaf patterns. def simm32 : ImmLeaf(Imm);}]>; def simm32hi20 : ImmLeaf(Imm);}]>; +// A mask value that won't affect significant shift bits. +def immshiftxlen : ImmLeafis64Bit()) + return countTrailingOnes(Imm) >= 6; + return countTrailingOnes(Imm) >= 5; +}]>; // Addressing modes. // Necessary because a frameindex can't be matched directly in a pattern. @@ -646,13 +652,24 @@ def : PatGprGpr; def : PatGprSimm12; def : PatGprGpr; def : PatGprSimm12; -def : PatGprGpr; def : PatGprUimmLog2XLen; -def : PatGprGpr; def : PatGprUimmLog2XLen; -def : PatGprGpr; def : PatGprUimmLog2XLen; +// Match both a plain shift and one where the shift amount is masked (this is +// typically introduced when the legalizer promotes the shift amount and +// zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base +// ISA only read the least significant 5 bits (RV32I) or 6 bits (RV64I). 
+multiclass VarShiftXLenPat { + def : Pat<(ShiftOp GPR:$rs1, GPR:$rs2), (Inst GPR:$rs1, GPR:$rs2)>; + def : Pat<(ShiftOp GPR:$rs1, (and GPR:$rs2, immshiftxlen)), + (Inst GPR:$rs1, GPR:$rs2)>; +} + +defm : VarShiftXLenPat; +defm : VarShiftXLenPat; +defm : VarShiftXLenPat; + /// FrameIndex calculations def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12), diff --git a/test/CodeGen/RISCV/alu16.ll b/test/CodeGen/RISCV/alu16.ll index 20b79a987f6..79e74ffc8a5 100644 --- a/test/CodeGen/RISCV/alu16.ll +++ b/test/CodeGen/RISCV/alu16.ll @@ -6,8 +6,6 @@ ; that legalisation of these non-native types doesn't introduce unnecessary ; inefficiencies. -; TODO: it's unnecessary to mask (zero-extend) the shift amount. - define i16 @addi(i16 %a) nounwind { ; RV32I-LABEL: addi: ; RV32I: # %bb.0: @@ -122,9 +120,6 @@ define i16 @sub(i16 %a, i16 %b) nounwind { define i16 @sll(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: sll: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: ret %1 = shl i16 %a, %b @@ -173,7 +168,6 @@ define i16 @srl(i16 %a, i16 %b) nounwind { ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 16 ; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: ret @@ -184,9 +178,6 @@ define i16 @srl(i16 %a, i16 %b) nounwind { define i16 @sra(i16 %a, i16 %b) nounwind { ; RV32I-LABEL: sra: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -1 -; RV32I-NEXT: and a1, a1, a2 ; RV32I-NEXT: slli a0, a0, 16 ; RV32I-NEXT: srai a0, a0, 16 ; RV32I-NEXT: sra a0, a0, a1 diff --git a/test/CodeGen/RISCV/alu8.ll b/test/CodeGen/RISCV/alu8.ll index f7d0e8beef3..ad97e620319 100644 --- a/test/CodeGen/RISCV/alu8.ll +++ b/test/CodeGen/RISCV/alu8.ll @@ -6,8 +6,6 @@ ; that legalisation of these non-native types doesn't introduce unnecessary ; inefficiencies. -; TODO: it's unnecessary to mask (zero-extend) the shift amount. 
- define i8 @addi(i8 %a) nounwind { ; RV32I-LABEL: addi: ; RV32I: # %bb.0: @@ -118,7 +116,6 @@ define i8 @sub(i8 %a, i8 %b) nounwind { define i8 @sll(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: sll: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a1, a1, 255 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: ret %1 = shl i8 %a, %b @@ -163,7 +160,6 @@ define i8 @xor(i8 %a, i8 %b) nounwind { define i8 @srl(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: srl: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a1, a1, 255 ; RV32I-NEXT: andi a0, a0, 255 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: ret @@ -174,7 +170,6 @@ define i8 @srl(i8 %a, i8 %b) nounwind { define i8 @sra(i8 %a, i8 %b) nounwind { ; RV32I-LABEL: sra: ; RV32I: # %bb.0: -; RV32I-NEXT: andi a1, a1, 255 ; RV32I-NEXT: slli a0, a0, 24 ; RV32I-NEXT: srai a0, a0, 24 ; RV32I-NEXT: sra a0, a0, a1 diff --git a/test/CodeGen/RISCV/shift-masked-shamt.ll b/test/CodeGen/RISCV/shift-masked-shamt.ll new file mode 100644 index 00000000000..5c77aa2d77f --- /dev/null +++ b/test/CodeGen/RISCV/shift-masked-shamt.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I + +; This test checks that unnecessary masking of shift amount operands is +; eliminated during instruction selection. The test needs to ensure that the +; masking is not removed if it may affect the shift amount. 
+ +define i32 @sll_redundant_mask(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sll_redundant_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret + %1 = and i32 %b, 31 + %2 = shl i32 %a, %1 + ret i32 %2 +} + +define i32 @sll_non_redundant_mask(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sll_non_redundant_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a1, a1, 15 +; RV32I-NEXT: sll a0, a0, a1 +; RV32I-NEXT: ret + %1 = and i32 %b, 15 + %2 = shl i32 %a, %1 + ret i32 %2 +} + +define i32 @srl_redundant_mask(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: srl_redundant_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: ret + %1 = and i32 %b, 4095 + %2 = lshr i32 %a, %1 + ret i32 %2 +} + +define i32 @srl_non_redundant_mask(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: srl_non_redundant_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a1, a1, 7 +; RV32I-NEXT: srl a0, a0, a1 +; RV32I-NEXT: ret + %1 = and i32 %b, 7 + %2 = lshr i32 %a, %1 + ret i32 %2 +} + +define i32 @sra_redundant_mask(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sra_redundant_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: sra a0, a0, a1 +; RV32I-NEXT: ret + %1 = and i32 %b, 65535 + %2 = ashr i32 %a, %1 + ret i32 %2 +} + +define i32 @sra_non_redundant_mask(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: sra_non_redundant_mask: +; RV32I: # %bb.0: +; RV32I-NEXT: andi a1, a1, 32 +; RV32I-NEXT: sra a0, a0, a1 +; RV32I-NEXT: ret + %1 = and i32 %b, 32 + %2 = ashr i32 %a, %1 + ret i32 %2 +} -- GitLab From d069d45aa888c5b8a44521d4cade32767c09b35e Mon Sep 17 00:00:00 2001 From: Kostya Serebryany Date: Fri, 12 Oct 2018 23:21:48 +0000 Subject: [PATCH 0137/1116] move GetOrCreateFunctionComdat to Instrumentation.cpp/Instrumentation.h Summary: GetOrCreateFunctionComdat is currently used in SanitizerCoverage, where it's defined. I'm planning to use it in HWASAN as well, so moving it into a common location.
NFC Reviewers: morehouse Reviewed By: morehouse Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53218 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344433 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Instrumentation.h | 6 +++++ .../Instrumentation/Instrumentation.cpp | 15 +++++++++++++ .../Instrumentation/SanitizerCoverage.cpp | 22 ++++--------------- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 2157fcab726..0011a5b3c51 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -27,6 +27,7 @@ namespace llvm { class FunctionPass; class ModulePass; class OptimizationRemarkEmitter; +class Comdat; /// Instrumentation passes often insert conditional checks into entry blocks. /// Call this function before splitting the entry block to move instructions @@ -41,6 +42,11 @@ GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str, bool AllowMerging, const char *NamePrefix = ""); +// Returns F.getComdat() if it exists. +// Otherwise creates a new comdat, sets F's comdat, and returns it. +// Returns nullptr on failure. 
+Comdat *GetOrCreateFunctionComdat(Function &F, const std::string &ModuleId); + // Insert GCOV profiling instrumentation struct GCOVOptions { static GCOVOptions getDefault(); diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 1c739c09e39..55b449ffca1 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -70,6 +70,21 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str, return GV; } +Comdat *llvm::GetOrCreateFunctionComdat(Function &F, + const std::string &ModuleId) { + if (auto Comdat = F.getComdat()) return Comdat; + assert(F.hasName()); + Module *M = F.getParent(); + std::string Name = F.getName(); + if (F.hasLocalLinkage()) { + if (ModuleId.empty()) + return nullptr; + Name += ModuleId; + } + F.setComdat(M->getOrInsertComdat(Name)); + return F.getComdat(); +} + /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. 
void llvm::initializeInstrumentation(PassRegistry &Registry) { diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 0bed4139518..b3450728f04 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -220,8 +220,6 @@ private: MDNode::get(*C, None)); } - Comdat *GetOrCreateFunctionComdat(Function &F); - std::string getSectionName(const std::string &Section) const; std::string getSectionStart(const std::string &Section) const; std::string getSectionEnd(const std::string &Section) const; @@ -590,28 +588,16 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { return true; } -Comdat *SanitizerCoverageModule::GetOrCreateFunctionComdat(Function &F) { - if (auto Comdat = F.getComdat()) return Comdat; - if (!TargetTriple.isOSBinFormatELF()) return nullptr; - assert(F.hasName()); - std::string Name = F.getName(); - if (F.hasLocalLinkage()) { - if (CurModuleUniqueId.empty()) return nullptr; - Name += CurModuleUniqueId; - } - auto Comdat = CurModule->getOrInsertComdat(Name); - F.setComdat(Comdat); - return Comdat; -} - GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( size_t NumElements, Function &F, Type *Ty, const char *Section) { ArrayType *ArrayTy = ArrayType::get(Ty, NumElements); auto Array = new GlobalVariable( *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage, Constant::getNullValue(ArrayTy), "__sancov_gen_"); - if (auto Comdat = GetOrCreateFunctionComdat(F)) - Array->setComdat(Comdat); + + if (TargetTriple.isOSBinFormatELF()) + if (auto Comdat = GetOrCreateFunctionComdat(F, CurModuleUniqueId)) + Array->setComdat(Comdat); Array->setSection(getSectionName(Section)); Array->setAlignment(Ty->isPointerTy() ? 
DL->getPointerSize() : Ty->getPrimitiveSizeInBits() / 8); -- GitLab From 93c7b61d509c53ed8dc790934eb9ca6bca64e57a Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Sat, 13 Oct 2018 07:09:10 +0000 Subject: [PATCH 0138/1116] [WebAssembly][NFC] Unify ARGUMENT classes Reviewers: aheejin, dschuff Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D53172 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344436 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyAsmPrinter.cpp | 16 +++++++------- .../WebAssembly/WebAssemblyFastISel.cpp | 10 ++++----- .../WebAssembly/WebAssemblyInstrInfo.td | 18 ++++++++-------- .../WebAssembly/WebAssemblyInstrSIMD.td | 21 ++++++------------- .../WebAssembly/WebAssemblyUtilities.cpp | 16 +++++++------- 5 files changed, 36 insertions(+), 45 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 2ea3760b923..b8ac85943eb 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -176,14 +176,14 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); switch (MI->getOpcode()) { - case WebAssembly::ARGUMENT_I32: - case WebAssembly::ARGUMENT_I32_S: - case WebAssembly::ARGUMENT_I64: - case WebAssembly::ARGUMENT_I64_S: - case WebAssembly::ARGUMENT_F32: - case WebAssembly::ARGUMENT_F32_S: - case WebAssembly::ARGUMENT_F64: - case WebAssembly::ARGUMENT_F64_S: + case WebAssembly::ARGUMENT_i32: + case WebAssembly::ARGUMENT_i32_S: + case WebAssembly::ARGUMENT_i64: + case WebAssembly::ARGUMENT_i64_S: + case WebAssembly::ARGUMENT_f32: + case WebAssembly::ARGUMENT_f32_S: + case WebAssembly::ARGUMENT_f64: + case WebAssembly::ARGUMENT_f64_S: case WebAssembly::ARGUMENT_v16i8: case WebAssembly::ARGUMENT_v16i8_S: case WebAssembly::ARGUMENT_v8i16: diff --git 
a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 8dc535445d6..0be4f228347 100644 --- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -646,19 +646,19 @@ bool WebAssemblyFastISel::fastLowerArguments() { case MVT::i8: case MVT::i16: case MVT::i32: - Opc = WebAssembly::ARGUMENT_I32; + Opc = WebAssembly::ARGUMENT_i32; RC = &WebAssembly::I32RegClass; break; case MVT::i64: - Opc = WebAssembly::ARGUMENT_I64; + Opc = WebAssembly::ARGUMENT_i64; RC = &WebAssembly::I64RegClass; break; case MVT::f32: - Opc = WebAssembly::ARGUMENT_F32; + Opc = WebAssembly::ARGUMENT_f32; RC = &WebAssembly::F32RegClass; break; case MVT::f64: - Opc = WebAssembly::ARGUMENT_F64; + Opc = WebAssembly::ARGUMENT_f64; RC = &WebAssembly::F64RegClass; break; case MVT::v16i8: @@ -686,7 +686,7 @@ bool WebAssemblyFastISel::fastLowerArguments() { RC = &WebAssembly::V128RegClass; break; case MVT::ExceptRef: - Opc = WebAssembly::ARGUMENT_EXCEPT_REF; + Opc = WebAssembly::ARGUMENT_ExceptRef; RC = &WebAssembly::EXCEPT_REFRegClass; break; default: diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 9e1409cf90e..8d98510c67d 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -163,18 +163,18 @@ include "WebAssemblyInstrFormats.td" // Additional instructions. 
//===----------------------------------------------------------------------===// -multiclass ARGUMENT { +multiclass ARGUMENT { let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [], Uses = [ARGUMENTS] in - defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), - (outs), (ins i32imm:$argno), - [(set vt:$res, (WebAssemblyargument timm:$argno))]>; + defm ARGUMENT_#vt : + I<(outs reg:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno), + [(set (vt reg:$res), (WebAssemblyargument timm:$argno))]>; } -defm "": ARGUMENT; -defm "": ARGUMENT; -defm "": ARGUMENT; -defm "": ARGUMENT; -defm "": ARGUMENT; +defm "": ARGUMENT; +defm "": ARGUMENT; +defm "": ARGUMENT; +defm "": ARGUMENT; +defm "": ARGUMENT; // get_local and set_local are not generated by instruction selection; they // are implied by virtual register uses and defs. diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index b575a039ae0..1eb38588c81 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -21,21 +21,12 @@ multiclass SIMD_I; } -multiclass SIMD_ARGUMENT { - let hasSideEffects = 1, isCodeGenOnly = 1, - Defs = [], Uses = [ARGUMENTS] in - defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno), - (outs), (ins i32imm:$argno), - [(set (vt V128:$res), - (WebAssemblyargument timm:$argno))]>; -} - -defm "": SIMD_ARGUMENT; -defm "": SIMD_ARGUMENT; -defm "": SIMD_ARGUMENT; -defm "": SIMD_ARGUMENT; -defm "": SIMD_ARGUMENT; -defm "": SIMD_ARGUMENT; +defm "" : ARGUMENT; +defm "" : ARGUMENT; +defm "" : ARGUMENT; +defm "" : ARGUMENT; +defm "" : ARGUMENT; +defm "" : ARGUMENT; // Constrained immediate argument types foreach SIZE = [8, 16] in diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index a25ec7cf4c2..ada6fb9a96d 100644 --- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ 
b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -27,14 +27,14 @@ const char *const WebAssembly::PersonalityWrapperFn = bool WebAssembly::isArgument(const MachineInstr &MI) { switch (MI.getOpcode()) { - case WebAssembly::ARGUMENT_I32: - case WebAssembly::ARGUMENT_I32_S: - case WebAssembly::ARGUMENT_I64: - case WebAssembly::ARGUMENT_I64_S: - case WebAssembly::ARGUMENT_F32: - case WebAssembly::ARGUMENT_F32_S: - case WebAssembly::ARGUMENT_F64: - case WebAssembly::ARGUMENT_F64_S: + case WebAssembly::ARGUMENT_i32: + case WebAssembly::ARGUMENT_i32_S: + case WebAssembly::ARGUMENT_i64: + case WebAssembly::ARGUMENT_i64_S: + case WebAssembly::ARGUMENT_f32: + case WebAssembly::ARGUMENT_f32_S: + case WebAssembly::ARGUMENT_f64: + case WebAssembly::ARGUMENT_f64_S: case WebAssembly::ARGUMENT_v16i8: case WebAssembly::ARGUMENT_v16i8_S: case WebAssembly::ARGUMENT_v8i16: -- GitLab From 6e3463c0eb418dd2265a827d807814ab6ac53554 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Sat, 13 Oct 2018 07:21:44 +0000 Subject: [PATCH 0139/1116] [Intrinsic] Add llvm.minimum and llvm.maximum intrinsic functions Summary: These new intrinsics have the semantics of the `minimum` and `maximum` operations specified by the latest draft of IEEE 754-2018. Unlike llvm.minnum and llvm.maxnum, these new intrinsics propagate NaNs and always treat -0.0 as less than 0.0. `minimum` and `maximum` lower directly to the existing `fminnan` and `fmaxnan` ISel DAG nodes. It is safe to reuse these DAG nodes because before this patch they were only emitted in situations where there were known to be no NaN arguments or where NaN propagation was correct and there were known to be no zero arguments. I know of only four backends that lower fminnan and fmaxnan: WebAssembly, ARM, AArch64, and SystemZ, and each of these lowers fminnan and fmaxnan to instructions that are compatible with the IEEE 754-2018 semantics. 
Reviewers: aheejin, dschuff, sunfish, javed.absar Subscribers: kristof.beyls, dexonsmith, kristina, llvm-commits Differential Revision: https://reviews.llvm.org/D52764 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344437 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/LangRef.rst | 76 +++++++++++++++++++ include/llvm/ADT/APFloat.h | 26 +++++++ include/llvm/CodeGen/ISDOpcodes.h | 5 +- include/llvm/IR/IRBuilder.h | 10 +++ include/llvm/IR/Intrinsics.td | 8 ++ .../SelectionDAG/SelectionDAGBuilder.cpp | 12 +++ unittests/ADT/APFloatTest.cpp | 30 ++++++++ unittests/IR/IRBuilderTest.cpp | 8 ++ 8 files changed, 173 insertions(+), 2 deletions(-) diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 9fcfd29a6e8..e977657d1cb 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -11560,6 +11560,82 @@ NaN, the intrinsic lowering is responsible for quieting the inputs to correctly return the non-NaN input (e.g. by using the equivalent of ``llvm.canonicalize``). +'``llvm.minimum.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.minimum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.minimum.f32(float %Val0, float %Val1) + declare double @llvm.minimum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.minimum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.minimum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.minimum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +""""""""" + +The '``llvm.minimum.*``' intrinsics return the minimum of the two +arguments, propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"""""""""" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"""""""""" +If either operand is a NaN, returns NaN. Otherwise returns the lesser +of the two arguments. 
-0.0 is considered to be less than +0.0 for this +intrinsic. Note that these are the semantics specified in the draft of +IEEE 754-2018. + +'``llvm.maximum.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.maximum`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.maximum.f32(float %Val0, float %Val1) + declare double @llvm.maximum.f64(double %Val0, double %Val1) + declare x86_fp80 @llvm.maximum.f80(x86_fp80 %Val0, x86_fp80 %Val1) + declare fp128 @llvm.maximum.f128(fp128 %Val0, fp128 %Val1) + declare ppc_fp128 @llvm.maximum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1) + +Overview: +""""""""" + +The '``llvm.maximum.*``' intrinsics return the maximum of the two +arguments, propagating NaNs and treating -0.0 as less than +0.0. + + +Arguments: +"""""""""" + +The arguments and return value are floating-point numbers of the same +type. + +Semantics: +"""""""""" +If either operand is a NaN, returns NaN. Otherwise returns the greater +of the two arguments. -0.0 is considered to be less than +0.0 for this +intrinsic. Note that these are the semantics specified in the draft of +IEEE 754-2018. + '``llvm.copysign.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h index 5c59af4c04b..52ed183c78a 100644 --- a/include/llvm/ADT/APFloat.h +++ b/include/llvm/ADT/APFloat.h @@ -1243,6 +1243,32 @@ inline APFloat maxnum(const APFloat &A, const APFloat &B) { return (A.compare(B) == APFloat::cmpLessThan) ? B : A; } +/// Implements IEEE 754-2018 minimum semantics. Returns the smaller of 2 +/// arguments, propagating NaNs and treating -0 as less than +0. 
+LLVM_READONLY +inline APFloat minimum(const APFloat &A, const APFloat &B) { + if (A.isNaN()) + return A; + if (B.isNaN()) + return B; + if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative())) + return A.isNegative() ? A : B; + return (B.compare(A) == APFloat::cmpLessThan) ? B : A; +} + +/// Implements IEEE 754-2018 maximum semantics. Returns the larger of 2 +/// arguments, propagating NaNs and treating -0 as less than +0. +LLVM_READONLY +inline APFloat maximum(const APFloat &A, const APFloat &B) { + if (A.isNaN()) + return A; + if (B.isNaN()) + return B; + if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative())) + return A.isNegative() ? B : A; + return (A.compare(B) == APFloat::cmpLessThan) ? B : A; +} + } // namespace llvm #undef APFLOAT_DISPATCH_ON_SEMANTICS diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index ec9c46140d7..d9a513fe247 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -560,8 +560,9 @@ namespace ISD { /// /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0. FMINNUM, FMAXNUM, - /// FMINNAN/FMAXNAN - Behave identically to FMINNUM/FMAXNUM, except that - /// when a single input is NaN, NaN is returned. + /// FMINNAN/FMAXNAN - NaN-propagating minimum/maximum that also treat -0.0 + /// as less than 0.0. While FMINNUM/FMAXNUM follow IEEE 754-2008 semantics, + /// FMINNAN/FMAXNAN follow IEEE 754-2018 draft semantics. FMINNAN, FMAXNAN, /// FSINCOS - Compute both fsin and fcos as a single operation. diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index 0af53c5b3f4..e89c44380d0 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -705,6 +705,16 @@ public: return CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS, nullptr, Name); } + /// Create call to the minimum intrinsic. 
+ CallInst *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") { + return CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS, nullptr, Name); + } + + /// Create call to the maximum intrinsic. + CallInst *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") { + return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name); + } + private: /// Create a call to a masked intrinsic with given Id. CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef Ops, diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index b405e86ef40..410e35f9acb 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -453,6 +453,14 @@ def int_maxnum : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, Commutative] >; +def int_minimum : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, Commutative] +>; +def int_maximum : Intrinsic<[llvm_anyfloat_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, Commutative] +>; // NOTE: these are internal interfaces. 
def int_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 868160c77a3..f7866665bcb 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5584,6 +5584,18 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(1)))); return nullptr; } + case Intrinsic::minimum: + setValue(&I, DAG.getNode(ISD::FMINNAN, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + case Intrinsic::maximum: + setValue(&I, DAG.getNode(ISD::FMAXNAN, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; case Intrinsic::copysign: setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl, getValue(I.getArgOperand(0)).getValueType(), diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp index 1212b45fb57..b739e857849 100644 --- a/unittests/ADT/APFloatTest.cpp +++ b/unittests/ADT/APFloatTest.cpp @@ -555,6 +555,36 @@ TEST(APFloatTest, MaxNum) { EXPECT_EQ(1.0, maxnum(nan, f1).convertToDouble()); } +TEST(APFloatTest, Minimum) { + APFloat f1(1.0); + APFloat f2(2.0); + APFloat zp(0.0); + APFloat zn(-0.0); + APFloat nan = APFloat::getNaN(APFloat::IEEEdouble()); + + EXPECT_EQ(1.0, minimum(f1, f2).convertToDouble()); + EXPECT_EQ(1.0, minimum(f2, f1).convertToDouble()); + EXPECT_EQ(-0.0, minimum(zp, zn).convertToDouble()); + EXPECT_EQ(-0.0, minimum(zn, zp).convertToDouble()); + EXPECT_TRUE(std::isnan(minimum(f1, nan).convertToDouble())); + EXPECT_TRUE(std::isnan(minimum(nan, f1).convertToDouble())); +} + +TEST(APFloatTest, Maximum) { + APFloat f1(1.0); + APFloat f2(2.0); + APFloat zp(0.0); + APFloat zn(-0.0); + APFloat nan = APFloat::getNaN(APFloat::IEEEdouble()); + + EXPECT_EQ(2.0, maximum(f1, 
f2).convertToDouble()); + EXPECT_EQ(2.0, maximum(f2, f1).convertToDouble()); + EXPECT_EQ(0.0, maximum(zp, zn).convertToDouble()); + EXPECT_EQ(0.0, maximum(zn, zp).convertToDouble()); + EXPECT_TRUE(std::isnan(maximum(f1, nan).convertToDouble())); + EXPECT_TRUE(std::isnan(maximum(nan, f1).convertToDouble())); +} + TEST(APFloatTest, Denormal) { APFloat::roundingMode rdmd = APFloat::rmNearestTiesToEven; diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index 42c0393d382..713c0a14f66 100644 --- a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -68,6 +68,14 @@ TEST_F(IRBuilderTest, Intrinsics) { II = cast(Call); EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maxnum); + Call = Builder.CreateMinimum(V, V); + II = cast(Call); + EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minimum); + + Call = Builder.CreateMaximum(V, V); + II = cast(Call); + EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maximum); + Call = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {}); II = cast(Call); EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter); -- GitLab From 3baba1cf36ad30ab99ca557ab8f208c3be88aaaf Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Sat, 13 Oct 2018 07:26:10 +0000 Subject: [PATCH 0140/1116] [WebAssembly] SIMD min and max Summary: Depends on D52324 and D52764. 
Reviewers: aheejin, dschuff Subscribers: sbc100, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D52325 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344438 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../WebAssembly/WebAssemblyInstrSIMD.td | 14 +- test/CodeGen/WebAssembly/f32.ll | 24 ++- test/CodeGen/WebAssembly/f64.ll | 24 ++- test/CodeGen/WebAssembly/simd-arith.ll | 168 ++++++++++++++++++ test/MC/WebAssembly/simd-encodings.s | 12 ++ 5 files changed, 223 insertions(+), 19 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 1eb38588c81..af5c03599cd 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -699,21 +699,21 @@ defm "" : SIMDAbs; // Floating-point min and max //===----------------------------------------------------------------------===// +multiclass SIMDBinaryFP baseInst> { + defm "" : SIMDBinary; + defm "" : SIMDBinary; +} + // NaN-propagating minimum: min -// TODO +defm MIN : SIMDBinaryFP; // NaN-propagating maximum: max -// TODO +defm MAX : SIMDBinaryFP; //===----------------------------------------------------------------------===// // Floating-point arithmetic //===----------------------------------------------------------------------===// -multiclass SIMDBinaryFP baseInst> { - defm "" : SIMDBinary; - defm "" : SIMDBinary; -} - // Addition: add let isCommutable = 1 in defm ADD : SIMDBinaryFP; diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll index 9314b2e6e5f..27520d035c9 100644 --- a/test/CodeGen/WebAssembly/f32.ll +++ b/test/CodeGen/WebAssembly/f32.ll @@ -123,12 +123,6 @@ define float @nearest32_via_rint(float %x) { ret float %a } -; Min and max tests. LLVM currently only forms fminnan and fmaxnan nodes in -; cases where there's a single fcmp with a select and it can prove that one -; of the arms is never NaN, so we only test that case. 
In the future if LLVM -; learns to form fminnan/fmaxnan in more cases, we can write more general -; tests. - ; CHECK-LABEL: fmin32: ; CHECK: f32.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}} ; CHECK-NEXT: return $pop1{{$}} @@ -147,6 +141,24 @@ define float @fmax32(float %x) { ret float %b } +; CHECK-LABEL: fmin32_intrinsic: +; CHECK: f32.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} +; CHECK-NEXT: return $pop0{{$}} +declare float @llvm.minimum.f32(float, float) +define float @fmin32_intrinsic(float %x, float %y) { + %a = call float @llvm.minimum.f32(float %x, float %y) + ret float %a +} + +; CHECK-LABEL: fmax32_intrinsic: +; CHECK: f32.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} +; CHECK-NEXT: return $pop0{{$}} +declare float @llvm.maximum.f32(float, float) +define float @fmax32_intrinsic(float %x, float %y) { + %a = call float @llvm.maximum.f32(float %x, float %y) + ret float %a +} + ; CHECK-LABEL: fma32: ; CHECK: {{^}} f32.call $push[[LR:[0-9]+]]=, fmaf@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} ; CHECK-NEXT: return $pop[[LR]]{{$}} diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll index 5635e326561..d02767fa3a1 100644 --- a/test/CodeGen/WebAssembly/f64.ll +++ b/test/CodeGen/WebAssembly/f64.ll @@ -123,12 +123,6 @@ define double @nearest64_via_rint(double %x) { ret double %a } -; Min and max tests. LLVM currently only forms fminnan and fmaxnan nodes in -; cases where there's a single fcmp with a select and it can prove that one -; of the arms is never NaN, so we only test that case. In the future if LLVM -; learns to form fminnan/fmaxnan in more cases, we can write more general -; tests. 
- ; CHECK-LABEL: fmin64: ; CHECK: f64.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}} ; CHECK-NEXT: return $pop1{{$}} @@ -147,6 +141,24 @@ define double @fmax64(double %x) { ret double %b } +; CHECK-LABEL: fmin64_intrinsic: +; CHECK: f64.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} +; CHECK-NEXT: return $pop0{{$}} +declare double @llvm.minimum.f64(double, double) +define double @fmin64_intrinsic(double %x, double %y) { + %a = call double @llvm.minimum.f64(double %x, double %y) + ret double %a +} + +; CHECK-LABEL: fmax64_intrinsic: +; CHECK: f64.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} +; CHECK-NEXT: return $pop0{{$}} +declare double @llvm.maximum.f64(double, double) +define double @fmax64_intrinsic(double %x, double %y) { + %a = call double @llvm.maximum.f64(double %x, double %y) + ret double %a +} + ; CHECK-LABEL: fma64: ; CHECK: {{^}} f64.call $push[[LR:[0-9]+]]=, fma@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}} ; CHECK-NEXT: return $pop[[LR]]{{$}} diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll index f3e70156d8b..973f78b30dc 100644 --- a/test/CodeGen/WebAssembly/simd-arith.ll +++ b/test/CodeGen/WebAssembly/simd-arith.ll @@ -765,6 +765,90 @@ define <4 x float> @abs_v4f32(<4 x float> %x) { ret <4 x float> %a } +; CHECK-LABEL: min_unordered_v4f32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <4 x float> @min_unordered_v4f32(<4 x float> %x) { + %cmps = fcmp ule <4 x float> %x, + %a = select <4 x i1> %cmps, <4 x float> %x, + <4 x float> + ret <4 x float> %a +} + +; CHECK-LABEL: max_unordered_v4f32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f32.const 
$push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <4 x float> @max_unordered_v4f32(<4 x float> %x) { + %cmps = fcmp uge <4 x float> %x, + %a = select <4 x i1> %cmps, <4 x float> %x, + <4 x float> + ret <4 x float> %a +} + +; CHECK-LABEL: min_ordered_v4f32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <4 x float> @min_ordered_v4f32(<4 x float> %x) { + %cmps = fcmp ole <4 x float> , %x + %a = select <4 x i1> %cmps, + <4 x float> , <4 x float> %x + ret <4 x float> %a +} + +; CHECK-LABEL: max_ordered_v4f32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <4 x float> @max_ordered_v4f32(<4 x float> %x) { + %cmps = fcmp oge <4 x float> , %x + %a = select <4 x i1> %cmps, + <4 x float> , <4 x float> %x + ret <4 x float> %a +} + +; CHECK-LABEL: min_intrinsic_v4f32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: .param v128, v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $1{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>) +define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { + %a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %a +} + +; CHECK-LABEL: max_intrinsic_v4f32: +; NO-SIMD128-NOT: f32x4 +; SIMD128-NEXT: 
.param v128, v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $1{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) +define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { + %a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) + ret <4 x float> %a +} + ; CHECK-LABEL: add_v4f32: ; NO-SIMD128-NOT: f32x4 ; SIMD128-NEXT: .param v128, v128{{$}} @@ -848,6 +932,90 @@ define <2 x double> @abs_v2f64(<2 x double> %x) { ret <2 x double> %a } +; CHECK-LABEL: min_unordered_v2f64: +; NO-SIMD128-NOT: f64x2 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <2 x double> @min_unordered_v2f64(<2 x double> %x) { + %cmps = fcmp ule <2 x double> %x, + %a = select <2 x i1> %cmps, <2 x double> %x, + <2 x double> + ret <2 x double> %a +} + +; CHECK-LABEL: max_unordered_v2f64: +; NO-SIMD128-NOT: f64x2 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <2 x double> @max_unordered_v2f64(<2 x double> %x) { + %cmps = fcmp uge <2 x double> %x, + %a = select <2 x i1> %cmps, <2 x double> %x, + <2 x double> + ret <2 x double> %a +} + +; CHECK-LABEL: min_ordered_v2f64: +; NO-SIMD128-NOT: f64x2 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}} +; SIMD128-NEXT: return 
$pop[[R]]{{$}} +define <2 x double> @min_ordered_v2f64(<2 x double> %x) { + %cmps = fcmp ole <2 x double> , %x + %a = select <2 x i1> %cmps, <2 x double> , + <2 x double> %x + ret <2 x double> %a +} + +; CHECK-LABEL: max_ordered_v2f64: +; NO-SIMD128-NOT: f64x2 +; SIMD128-NEXT: .param v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2 +; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]] +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $pop[[L1]], $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <2 x double> @max_ordered_v2f64(<2 x double> %x) { + %cmps = fcmp oge <2 x double> , %x + %a = select <2 x i1> %cmps, <2 x double> , + <2 x double> %x + ret <2 x double> %a +} + +; CHECK-LABEL: min_intrinsic_v2f64: +; NO-SIMD128-NOT: f64x2 +; SIMD128-NEXT: .param v128, v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $1{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) +define <2 x double> @min_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) { + %a = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %a +} + +; CHECK-LABEL: max_intrinsic_v2f64: +; NO-SIMD128-NOT: f64x2 +; SIMD128-NEXT: .param v128, v128{{$}} +; SIMD128-NEXT: .result v128{{$}} +; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $1{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>) +define <2 x double> @max_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) { + %a = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> %y) + ret <2 x double> %a +} + ; CHECK-LABEL: add_v2f64: ; NO-SIMD128-NOT: f64x2 ; SIMD128-VM-NOT: f62x2 diff --git a/test/MC/WebAssembly/simd-encodings.s b/test/MC/WebAssembly/simd-encodings.s index 02d07674c16..8cd4bc9cd34 100644 --- a/test/MC/WebAssembly/simd-encodings.s +++ 
b/test/MC/WebAssembly/simd-encodings.s @@ -382,6 +382,18 @@ # CHECK: f64x2.abs # encoding: [0xfd,0x80] f64x2.abs + # CHECK: f32x4.min # encoding: [0xfd,0x81] + f32x4.min + + # CHECK: f64x2.min # encoding: [0xfd,0x82] + f64x2.min + + # CHECK: f32x4.max # encoding: [0xfd,0x83] + f32x4.max + + # CHECK: f64x2.max # encoding: [0xfd,0x84] + f64x2.max + # CHECK: f32x4.add # encoding: [0xfd,0x85] f32x4.add -- GitLab From 952b7309b14ccede0e01ccd43f85d8c28aa99280 Mon Sep 17 00:00:00 2001 From: "Arnaud A. de Grandmaison" Date: Sat, 13 Oct 2018 07:43:56 +0000 Subject: [PATCH 0141/1116] [AArch64] Swap comparison operands if that enables some folding. Summary: AArch64 can fold some shift+extend operations on the RHS operand of comparisons, so swap the operands if that makes sense. This provides a fix for https://bugs.llvm.org/show_bug.cgi?id=38751 Reviewers: efriedma, t.p.northover, javed.absar Subscribers: mcrosier, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D53067 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344439 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 86 ++- test/CodeGen/AArch64/and-mask-removal.ll | 12 +- .../lack-of-signed-truncation-check.ll | 45 +- test/CodeGen/AArch64/sat-add.ll | 16 +- .../AArch64/signed-truncation-check.ll | 45 +- test/CodeGen/AArch64/swap-compare-operands.ll | 632 ++++++++++++++++++ 6 files changed, 752 insertions(+), 84 deletions(-) create mode 100644 test/CodeGen/AArch64/swap-compare-operands.ll diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 8cf9d55a950..90633807cdf 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1460,6 +1460,21 @@ static bool isLegalArithImmed(uint64_t C) { return IsLegal; } +// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on +// the grounds that "op1 - (-op2) == op1 + op2" ? 
Not always, the C and V flags +// can be set differently by this operation. It comes down to whether +// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then +// everything is fine. If not then the optimization is wrong. Thus general +// comparisons are only valid if op2 != 0. +// +// So, finally, the only LLVM-native comparisons that don't mention C and V +// are SETEQ and SETNE. They're the only ones we can safely use CMN for in +// the absence of information about op2. +static bool isCMN(SDValue Op, ISD::CondCode CC) { + return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) && + (CC == ISD::SETEQ || CC == ISD::SETNE); +} + static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); @@ -1482,18 +1497,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on - // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags - // can be set differently by this operation. It comes down to whether - // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then - // everything is fine. If not then the optimization is wrong. Thus general - // comparisons are only valid if op2 != 0. - - // So, finally, the only LLVM-native comparisons that don't mention C and V - // are SETEQ and SETNE. They're the only ones we can safely use CMN for in - // the absence of information about op2. + if (isCMN(RHS, CC)) { + // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ? 
Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && @@ -1765,6 +1770,42 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, /// @} +/// Returns how profitable it is to fold a comparison's operand's shift and/or +/// extension operations. +static unsigned getCmpOperandFoldingProfit(SDValue Op) { + auto isSupportedExtend = [&](SDValue V) { + if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) + return true; + + if (V.getOpcode() == ISD::AND) + if (ConstantSDNode *MaskCst = dyn_cast(V.getOperand(1))) { + uint64_t Mask = MaskCst->getZExtValue(); + return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); + } + + return false; + }; + + if (!Op.hasOneUse()) + return 0; + + if (isSupportedExtend(Op)) + return 1; + + unsigned Opc = Op.getOpcode(); + if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) + if (ConstantSDNode *ShiftCst = dyn_cast(Op.getOperand(1))) { + uint64_t Shift = ShiftCst->getZExtValue(); + if (isSupportedExtend(Op.getOperand(0))) + return (Shift <= 4) ? 2 : 1; + EVT VT = Op.getValueType(); + if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) + return 1; + } + + return 0; +} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl) { @@ -1822,6 +1863,27 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } + + // Comparisons are canonicalized so that the RHS operand is simpler than the + // LHS one, the extreme case being when RHS is an immediate. However, AArch64 + // can fold some shift+extend operations on the RHS operand, so swap the + // operands if that can be done. + // + // For example: + // lsl w13, w11, #1 + // cmp w13, w12 + // can be turned into: + // cmp w12, w11, lsl #1 + if (!isa(RHS) || + !isLegalArithImmed(cast(RHS)->getZExtValue())) { + SDValue TheLHS = isCMN(LHS, CC) ? 
LHS.getOperand(1) : LHS; + + if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { + std::swap(LHS, RHS); + CC = ISD::getSetCCSwappedOperands(CC); + } + } + SDValue Cmp; AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll index c02bc881cd3..4424b0e4112 100644 --- a/test/CodeGen/AArch64/and-mask-removal.ll +++ b/test/CodeGen/AArch64/and-mask-removal.ll @@ -179,7 +179,9 @@ ret_false: ret_true: ret i1 true ; CHECK-LABEL: test16_2 -; CHECK: and +; CHECK: mov [[CST:w[0-9]+]], #16882 +; CHECK: add [[ADD:w[0-9]+]], w0, [[CST]] +; CHECK: cmp {{.*}}, [[ADD]], uxth ; CHECK: ret } @@ -207,7 +209,9 @@ ret_false: ret_true: ret i1 true ; CHECK-LABEL: test16_4 -; CHECK: and +; CHECK: mov [[CST:w[0-9]+]], #29985 +; CHECK: add [[ADD:w[0-9]+]], w0, [[CST]] +; CHECK: cmp {{.*}}, [[ADD]], uxth ; CHECK: ret } @@ -249,7 +253,9 @@ ret_false: ret_true: ret i1 true ; CHECK-LABEL: test16_7 -; CHECK: and +; CHECK: mov [[CST:w[0-9]+]], #9272 +; CHECK: add [[ADD:w[0-9]+]], w0, [[CST]] +; CHECK: cmp {{.*}}, [[ADD]], uxth ; CHECK: ret } diff --git a/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll b/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll index d8ae73293d9..f4680354d7e 100644 --- a/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll +++ b/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll @@ -35,8 +35,7 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind { define i1 @shifts_necmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 16 ; 32-16 @@ -48,8 +47,7 @@ define i1 @shifts_necmp_i32_i16(i32 %x) nounwind { define i1 @shifts_necmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, 
w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 24 ; 32-8 @@ -61,8 +59,7 @@ define i1 @shifts_necmp_i32_i8(i32 %x) nounwind { define i1 @shifts_necmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 32 ; 64-32 @@ -74,8 +71,7 @@ define i1 @shifts_necmp_i64_i32(i64 %x) nounwind { define i1 @shifts_necmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 48 ; 64-16 @@ -87,8 +83,7 @@ define i1 @shifts_necmp_i64_i16(i64 %x) nounwind { define i1 @shifts_necmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: shifts_necmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 56 ; 64-8 @@ -117,8 +112,7 @@ define i1 @add_ultcmp_i16_i8(i16 %x) nounwind { define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, -32768 ; ~0U << (16-1) @@ -129,8 +123,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { define i1 @add_ultcmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, -128 ; ~0U << (8-1) @@ -141,8 +134,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind { define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i32: ; CHECK: // %bb.0: -; 
CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1) @@ -153,8 +145,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, -32768 ; ~0U << (16-1) @@ -165,8 +156,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { define i1 @add_ultcmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, -128 ; ~0U << (8-1) @@ -208,8 +198,7 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind { define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, 32768 ; 1U << (16-1) @@ -220,8 +209,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i32 %x, 128 ; 1U << (8-1) @@ -232,8 +220,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { define i1 @add_ugecmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1) @@ -244,8 +231,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind { define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { 
; CHECK-LABEL: add_ugecmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, 32768 ; 1U << (16-1) @@ -256,8 +242,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %tmp0 = add i64 %x, 128 ; 1U << (8-1) diff --git a/test/CodeGen/AArch64/sat-add.ll b/test/CodeGen/AArch64/sat-add.ll index d9082859988..4d865a2b14b 100644 --- a/test/CodeGen/AArch64/sat-add.ll +++ b/test/CodeGen/AArch64/sat-add.ll @@ -52,11 +52,10 @@ define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) { define i16 @unsigned_sat_constant_i16_using_min(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_min: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w9, #65493 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #65493 +; CHECK-NEXT: cmp w8, w0, uxth ; CHECK-NEXT: mov w8, #-43 -; CHECK-NEXT: csel w8, w0, w8, lo +; CHECK-NEXT: csel w8, w0, w8, hi ; CHECK-NEXT: add w0, w8, #42 // =42 ; CHECK-NEXT: ret %c = icmp ult i16 %x, -43 @@ -82,11 +81,10 @@ define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) { define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) { ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w10, #65493 -; CHECK-NEXT: add w9, w0, #42 // =42 -; CHECK-NEXT: cmp w8, w10 -; CHECK-NEXT: csinv w0, w9, wzr, ls +; CHECK-NEXT: mov w9, #65493 +; CHECK-NEXT: add w8, w0, #42 // =42 +; CHECK-NEXT: cmp w9, w0, uxth +; CHECK-NEXT: csinv w0, w8, wzr, hs ; CHECK-NEXT: ret %a = add i16 %x, 42 %c = icmp ugt i16 %x, -43 diff --git a/test/CodeGen/AArch64/signed-truncation-check.ll 
b/test/CodeGen/AArch64/signed-truncation-check.ll index f475dbc2f74..edd61b10d00 100644 --- a/test/CodeGen/AArch64/signed-truncation-check.ll +++ b/test/CodeGen/AArch64/signed-truncation-check.ll @@ -35,8 +35,7 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind { define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 16 ; 32-16 @@ -48,8 +47,7 @@ define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind { define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i32 %x, 24 ; 32-8 @@ -61,8 +59,7 @@ define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind { define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 32 ; 64-32 @@ -74,8 +71,7 @@ define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind { define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 48 ; 64-16 @@ -87,8 +83,7 @@ define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind { define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: shifts_eqcmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = shl i64 %x, 56 ; 64-8 @@ -117,8 +112,7 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind { define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: 
add_ugecmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, -32768 ; ~0U << (16-1) @@ -129,8 +123,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind { define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, -128 ; ~0U << (8-1) @@ -141,8 +134,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind { define i1 @add_ugecmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1) @@ -153,8 +145,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind { define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, -32768 ; ~0U << (16-1) @@ -165,8 +156,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind { define i1 @add_ugecmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ugecmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, -128 ; ~0U << (8-1) @@ -208,8 +198,7 @@ define i1 @add_ultcmp_i16_i8(i16 %x) nounwind { define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, 32768 ; 1U << (16-1) @@ -220,8 +209,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind { define 
i1 @add_ultcmp_i32_i8(i32 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i32_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, w0 +; CHECK-NEXT: cmp w0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i32 %x, 128 ; 1U << (8-1) @@ -232,8 +220,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind { define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtw ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1) @@ -244,8 +231,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind { define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxth ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, 32768 ; 1U << (16-1) @@ -256,8 +242,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind { define i1 @add_ultcmp_i64_i8(i64 %x) nounwind { ; CHECK-LABEL: add_ultcmp_i64_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: sxtb x8, w0 -; CHECK-NEXT: cmp x8, x0 +; CHECK-NEXT: cmp x0, w0, sxtb ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %tmp0 = add i64 %x, 128 ; 1U << (8-1) diff --git a/test/CodeGen/AArch64/swap-compare-operands.ll b/test/CodeGen/AArch64/swap-compare-operands.ll new file mode 100644 index 00000000000..7c19b911166 --- /dev/null +++ b/test/CodeGen/AArch64/swap-compare-operands.ll @@ -0,0 +1,632 @@ +; RUN: llc < %s -mtriple=arm64 | FileCheck %s + +define i1 @testSwapCmpWithLSL64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSL64_1: +; CHECK: cmp x1, x0, lsl #1 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i64 %a, 1 + %cmp = icmp slt i64 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSL64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSL64_63: +; CHECK: cmp x1, x0, lsl #63 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i64 %a, 63 + %cmp = 
icmp slt i64 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSL32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSL32_1: +; CHECK: cmp w1, w0, lsl #1 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i32 %a, 1 + %cmp = icmp slt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSL32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSL32_31: +; CHECK: cmp w1, w0, lsl #31 +; CHECK-NEXT: cset w0, gt +entry: + %shl = shl i32 %a, 31 + %cmp = icmp slt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSR64_1: +; CHECK: cmp x1, x0, lsr #1 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i64 %a, 1 + %cmp = icmp slt i64 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithLSR64_63: +; CHECK: cmp x1, x0, lsr #63 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i64 %a, 63 + %cmp = icmp slt i64 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSR32_1: +; CHECK: cmp w1, w0, lsr #1 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i32 %a, 1 + %cmp = icmp slt i32 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithLSR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithLSR32_31: +; CHECK: cmp w1, w0, lsr #31 +; CHECK-NEXT: cset w0, gt +entry: + %lshr = lshr i32 %a, 31 + %cmp = icmp slt i32 %lshr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithASR64_1: +; CHECK: cmp x1, x0, asr #1 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i64 %a, 1 + %cmp = icmp slt i64 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithASR64_63: +; CHECK: cmp x1, x0, asr #63 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i64 %a, 63 + %cmp = icmp slt i64 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithASR32_1: 
+; CHECK: cmp w1, w0, asr #1 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i32 %a, 1 + %cmp = icmp slt i32 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithASR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithASR32_31: +; CHECK: cmp w1, w0, asr #31 +; CHECK-NEXT: cset w0, gt +entry: + %ashr = ashr i32 %a, 31 + %cmp = icmp slt i32 %ashr, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend32_64(i32 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64 +; CHECK: cmp x1, w0, uxtw #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = zext i32 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend16_64(i16 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64 +; CHECK: cmp x1, w0, uxth #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = zext i16 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend8_64(i8 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64 +; CHECK: cmp x1, w0, uxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = zext i8 %a to i64 + %shl.2 = shl i64 %a64, 4 + %cmp = icmp ugt i64 %shl.2, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend16_32(i16 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64 +; CHECK: cmp w1, w0, uxth #3 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i16 %a to i32 + %shl = shl i32 %a32, 3 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedZeroExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64 +; CHECK: cmp w1, w0, uxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i8 %a to i32 + %shl = shl i32 %a32, 4 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithTooLargeShiftedZeroExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithTooLargeShiftedZeroExtend8_64 +; CHECK: and [[REG:w[0-9]+]], w0, #0xff 
+; CHECK: cmp w1, [[REG]], lsl #5 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i8 %a to i32 + %shl = shl i32 %a32, 5 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithZeroExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithZeroExtend8_64 +; CHECK: cmp w1, w0, uxtb +; CHECK-NEXT: cset w0, lo +entry: + %a32 = zext i8 %a to i32 + %cmp = icmp ugt i32 %a32, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend32_64(i32 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64 +; CHECK: cmp x1, w0, sxtw #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = sext i32 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend16_64(i16 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64 +; CHECK: cmp x1, w0, sxth #2 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = sext i16 %a to i64 + %shl.0 = shl i64 %a64, 2 + %cmp = icmp ugt i64 %shl.0, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend8_64(i8 %a, i64 %b) { +; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64 +; CHECK: cmp x1, w0, sxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a64 = sext i8 %a to i64 + %shl.2 = shl i64 %a64, 4 + %cmp = icmp ugt i64 %shl.2, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend16_32(i16 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64 +; CHECK: cmp w1, w0, sxth #3 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i16 %a to i32 + %shl = shl i32 %a32, 3 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithShiftedSignExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64 +; CHECK: cmp w1, w0, sxtb #4 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i8 %a to i32 + %shl = shl i32 %a32, 4 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithTooLargeShiftedSignExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithTooLargeShiftedSignExtend8_64 +; 
CHECK: sxtb [[REG:w[0-9]+]], w0 +; CHECK-NEXT: cmp w1, [[REG]], lsl #5 +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i8 %a to i32 + %shl = shl i32 %a32, 5 + %cmp = icmp ugt i32 %shl, %b + ret i1 %cmp +} + +define i1 @testSwapCmpWithSignExtend8_32(i8 %a, i32 %b) { +; CHECK-LABEL testSwapCmpWithSignExtend8_64 +; CHECK: cmp w1, w0, sxtb +; CHECK-NEXT: cset w0, lo +entry: + %a32 = sext i8 %a to i32 + %cmp = icmp ugt i32 %a32, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSL64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSL64_1: +; CHECK: cmn x1, x0, lsl #1 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i64 %a, 1 + %na = sub i64 0, %shl + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: testing with a 62 bits shift as 63 has another optimization kicking in. +define i1 @testSwapCmnWithLSL64_62(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSL64_62: +; CHECK: cmn x1, x0, lsl #62 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i64 %a, 62 + %na = sub i64 0, %shl + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: the 63 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithLSL64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSL64_63: +; CHECK: cmp x1, x0, lsl #63 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i64 %a, 63 + %na = sub i64 0, %shl + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSL32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSL32_1: +; CHECK: cmn w1, w0, lsl #1 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i32 %a, 1 + %na = sub i32 0, %shl + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: testing with a 30 bits shift as 31 has another optimization kicking in. 
+define i1 @testSwapCmnWithLSL32_30(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSL32_30: +; CHECK: cmn w1, w0, lsl #30 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i32 %a, 30 + %na = sub i32 0, %shl + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: the 31 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithLSL32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSL32_31: +; CHECK: cmp w1, w0, lsl #31 +; CHECK-NEXT: cset w0, ne +entry: + %shl = shl i32 %a, 31 + %na = sub i32 0, %shl + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSR64_1: +; CHECK: cmn x1, x0, lsr #1 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i64 %a, 1 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: testing with a 62 bits shift as 63 has another optimization kicking in. +define i1 @testSwapCmnWithLSR64_62(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSR64_62: +; CHECK: cmn x1, x0, lsr #62 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i64 %a, 62 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: the 63 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. 
+define i1 @testSwapCmnWithLSR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithLSR64_63: +; CHECK: cmp x1, x0, asr #63 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i64 %a, 63 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithLSR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSR32_1: +; CHECK: cmn w1, w0, lsr #1 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i32 %a, 1 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: testing with a 30 bits shift as 31 has another optimization kicking in. +define i1 @testSwapCmnWithLSR32_30(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSR32_30: +; CHECK: cmn w1, w0, lsr #30 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i32 %a, 30 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: the 31 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithLSR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithLSR32_31: +; CHECK: cmp w1, w0, asr #31 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = lshr i32 %a, 31 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithASR64_1(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithASR64_1: +; CHECK: cmn x1, x0, asr #3 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i64 %a, 3 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: testing with a 62 bits shift as 63 has another optimization kicking in. 
+define i1 @testSwapCmnWithASR64_62(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithASR64_62: +; CHECK: cmn x1, x0, asr #62 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i64 %a, 62 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +; Note: the 63 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. +define i1 @testSwapCmnWithASR64_63(i64 %a, i64 %b) { +; CHECK-LABEL testSwapCmnWithASR64_63: +; CHECK: cmp x1, x0, lsr #63 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i64 %a, 63 + %na = sub i64 0, %lshr + %cmp = icmp ne i64 %na, %b + ret i1 %cmp +} + +define i1 @testSwapCmnWithASR32_1(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithASR32_1: +; CHECK: cmn w1, w0, asr #1 +; CHECK-NEXT: cset w0, eq +entry: + %lshr = ashr i32 %a, 1 + %na = sub i32 0, %lshr + %cmp = icmp eq i32 %na, %b + ret i1 %cmp +} + +; Note: testing with a 30 bits shift as 31 has another optimization kicking in. +define i1 @testSwapCmnWithASR32_30(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithASR32_30: +; CHECK: cmn w1, w0, asr #30 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i32 %a, 30 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +; Note: the 31 bits shift triggers a different optimization path, which leads +; to a similar result in terms of performances. We try to catch here any change +; so that this test can be adapted should the optimization be done with the +; operand swap. 
+define i1 @testSwapCmnWithASR32_31(i32 %a, i32 %b) { +; CHECK-LABEL testSwapCmnWithASR32_31: +; CHECK: cmp w1, w0, lsr #31 +; CHECK-NEXT: cset w0, ne +entry: + %lshr = ashr i32 %a, 31 + %na = sub i32 0, %lshr + %cmp = icmp ne i32 %na, %b + ret i1 %cmp +} + +define i64 @testSwapCmpToCmnWithZeroExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) { +; CHECK-LABEL testSwapCmpToCmnWithZeroExtend: +t0: + %conv0 = zext i32 %a32 to i64 + %shl0 = shl i64 %conv0, 1 + %na0 = sub i64 0, %shl0 + %cmp0 = icmp ne i64 %na0, %b64 +; CHECK: cmn x3, w0, uxtw #1 + br i1 %cmp0, label %t1, label %end + +t1: + %conv1 = zext i16 %a16 to i64 + %shl1 = shl i64 %conv1, 4 + %na1 = sub i64 0, %shl1 + %cmp1 = icmp ne i64 %na1, %b64 +; CHECK: cmn x3, w1, uxth #4 + br i1 %cmp1, label %t2, label %end + +t2: + %conv2 = zext i8 %a8 to i64 + %shl2 = shl i64 %conv2, 3 + %na2 = sub i64 0, %shl2 + %cmp2 = icmp ne i64 %na2, %b64 +; CHECK: cmn x3, w2, uxtb #3 + br i1 %cmp2, label %t3, label %end + +t3: + %conv3 = zext i16 %a16 to i32 + %shl3 = shl i32 %conv3, 2 + %na3 = sub i32 0, %shl3 + %cmp3 = icmp ne i32 %na3, %b32 +; CHECK: cmn w4, w1, uxth #2 + br i1 %cmp3, label %t4, label %end + +t4: + %conv4 = zext i8 %a8 to i32 + %shl4 = shl i32 %conv4, 1 + %na4 = sub i32 0, %shl4 + %cmp4 = icmp ne i32 %na4, %b32 +; CHECK: cmn w4, w2, uxtb #1 + br i1 %cmp4, label %t5, label %end + +t5: + %conv5 = zext i8 %a8 to i32 + %shl5 = shl i32 %conv5, 5 + %na5 = sub i32 0, %shl5 + %cmp5 = icmp ne i32 %na5, %b32 +; CHECK: and [[REG:w[0-9]+]], w2, #0xff +; CHECK: cmn w4, [[REG]], lsl #5 + br i1 %cmp5, label %t6, label %end + +t6: + %conv6 = zext i8 %a8 to i32 + %na6 = sub i32 0, %conv6 + %cmp6 = icmp ne i32 %na6, %b32 +; CHECK: cmn w4, w2, uxtb + br i1 %cmp6, label %t7, label %end + +t7: + ret i64 0 + +end: + ret i64 1 +} +define i64 @testSwapCmpToCmnWithSignExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) { +; CHECK-LABEL testSwapCmpToCmnWithSignExtend: +t0: + %conv0 = sext i32 %a32 to i64 + %shl0 = shl i64 
%conv0, 1 + %na0 = sub i64 0, %shl0 + %cmp0 = icmp ne i64 %na0, %b64 +; CHECK: cmn x3, w0, sxtw #1 + br i1 %cmp0, label %t1, label %end + +t1: + %conv1 = sext i16 %a16 to i64 + %shl1 = shl i64 %conv1, 4 + %na1 = sub i64 0, %shl1 + %cmp1 = icmp ne i64 %na1, %b64 +; CHECK: cmn x3, w1, sxth #4 + br i1 %cmp1, label %t2, label %end + +t2: + %conv2 = sext i8 %a8 to i64 + %shl2 = shl i64 %conv2, 3 + %na2 = sub i64 0, %shl2 + %cmp2 = icmp ne i64 %na2, %b64 +; CHECK: cmn x3, w2, sxtb #3 + br i1 %cmp2, label %t3, label %end + +t3: + %conv3 = sext i16 %a16 to i32 + %shl3 = shl i32 %conv3, 2 + %na3 = sub i32 0, %shl3 + %cmp3 = icmp ne i32 %na3, %b32 +; CHECK: cmn w4, w1, sxth #2 + br i1 %cmp3, label %t4, label %end + +t4: + %conv4 = sext i8 %a8 to i32 + %shl4 = shl i32 %conv4, 1 + %na4 = sub i32 0, %shl4 + %cmp4 = icmp ne i32 %na4, %b32 +; CHECK: cmn w4, w2, sxtb #1 + br i1 %cmp4, label %t5, label %end + +t5: + %conv5 = sext i8 %a8 to i32 + %shl5 = shl i32 %conv5, 5 + %na5 = sub i32 0, %shl5 + %cmp5 = icmp ne i32 %na5, %b32 +; CHECK: sxtb [[REG:w[0-9]+]], w2 +; CHECK: cmn w4, [[REG]], lsl #5 + br i1 %cmp5, label %t6, label %end + +t6: + %conv6 = sext i8 %a8 to i32 + %na6 = sub i32 0, %conv6 + %cmp6 = icmp ne i32 %na6, %b32 +; CHECK: cmn w4, w2, sxtb + br i1 %cmp6, label %t7, label %end + +t7: + ret i64 0 + +end: + ret i64 1 +} -- GitLab From 0f13604417ac2a666797fa40530a3164ba917a7d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 11:38:10 +0000 Subject: [PATCH 0142/1116] [X86][AVX] Add lowerVectorShuffleAsLanePermuteAndPermute for v4f64 shuffles (PR39161) Add shuffle lowering for the case where we can shuffle the lanes into place followed by an in-lane permute. This is mainly for cases where we can have non-repeating permutes in each lane, but for now I've just enabled it for v4f64 unary shuffles to fix PR39161 - there is no test coverage for other shuffles that might benefit yet. 
We now have several cross-lane shuffle lowering methods that all do something similar - I've looked at merging some of these (notably by making the repeated mask mechanism in lowerVectorShuffleByMerging128BitLanes optional), but there are a lot of assertions/assumptions in the way that make this tricky - I ended up going for adding yet another relatively simple method instead. Differential Revision: https://reviews.llvm.org/D53148 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344446 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 60 +++++++++++++++++++++++ test/CodeGen/X86/vector-shuffle-256-v4.ll | 16 +++--- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 86141965393..9020eebe203 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -13430,6 +13430,60 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } +/// Lower a vector shuffle crossing multiple 128-bit lanes as +/// a lane permutation followed by a per-lane permutation. +/// +/// This is mainly for cases where we can have non-repeating permutes +/// in each lane. +/// +/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes, +/// we should investigate merging them. 
+static SDValue lowerVectorShuffleAsLanePermuteAndPermute( + const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, + SelectionDAG &DAG, const X86Subtarget &Subtarget) { + int NumElts = VT.getVectorNumElements(); + int NumLanes = VT.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + + SmallVector SrcLaneMask(NumLanes, SM_SentinelUndef); + SmallVector LaneMask(NumElts, SM_SentinelUndef); + SmallVector PermMask(NumElts, SM_SentinelUndef); + + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + + // Ensure that each lane comes from a single source lane. + int SrcLane = M / NumEltsPerLane; + int DstLane = i / NumEltsPerLane; + if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane)) + return SDValue(); + SrcLaneMask[DstLane] = SrcLane; + + LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane); + PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane); + } + + // If we're only shuffling a single lowest lane and the rest are identity + // then don't bother. + // TODO - isShuffleMaskInputInPlace could be extended to something like this. + int NumIdentityLanes = 0; + bool OnlyShuffleLowestLane = true; + for (int i = 0; i != NumLanes; ++i) { + if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane, + i * NumEltsPerLane)) + NumIdentityLanes++; + else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes) + OnlyShuffleLowestLane = false; + } + if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) + return SDValue(); + + SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask); + return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); +} + /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. 
/// @@ -14166,6 +14220,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; + // Try to permute the lanes and then use a per-lane permute. + if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) + return V; + // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget); @@ -14200,6 +14259,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; + // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index ed281c31d46..b3750b74ad3 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -91,9 +91,8 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) { define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_1000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_1000: @@ -174,10 +173,8 @@ define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) { define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_2233: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4f64_2233: @@ -766,9 +763,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) { define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_1000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_1000: -- GitLab From e76d7099025f36d5c09a44404756b1f41f55af96 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 12:12:06 +0000 Subject: [PATCH 0143/1116] [X86][SSE] Change CTTZ vector lowering to cttz(x) = ctpop(~x & (x - 1)) This patch changes the vector CTTZ lowering from: cttz(x) = ctpop((x & -x) - 1) to: cttz(x) = ctpop(~x & (x - 1)) Not only does this make better use of the PANDN instruction, but it also matches the LegalizeDAG method which should allow us to remove the x86 specific code at some point in the future (we need to fix some issues with the bitcasted logic ops and CTPOP lowering first). 
Differential Revision: https://reviews.llvm.org/D53214 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344447 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 20 +- test/CodeGen/X86/vec_ctbits.ll | 82 +- test/CodeGen/X86/vector-tzcnt-128.ll | 1593 ++++++++++++-------------- test/CodeGen/X86/vector-tzcnt-256.ll | 924 +++++++-------- test/CodeGen/X86/vector-tzcnt-512.ll | 526 ++++----- 5 files changed, 1413 insertions(+), 1732 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 9020eebe203..5fb3ece19f2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22968,7 +22968,8 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, return Op; } -static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); @@ -22977,21 +22978,24 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDValue Zero = DAG.getConstant(0, dl, VT); - // lsb(x) = (x & -x) - SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, - DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + // Decompose 256-bit ops into smaller 128-bit ops. 
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntUnary(Op, DAG); - // cttz_undef(x) = (width - 1) - ctlz(lsb) + // cttz_undef(x) = (width - 1) - ctlz(x & -x) if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, DAG.getNode(ISD::CTLZ, dl, VT, LSB)); } - // cttz(x) = ctpop(lsb - 1) + // cttz(x) = ctpop(~x & (x - 1)) SDValue One = DAG.getConstant(1, dl, VT); return DAG.getNode(ISD::CTPOP, dl, VT, - DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), + DAG.getNode(ISD::SUB, dl, VT, N0, One))); } assert(Op.getOpcode() == ISD::CTTZ && @@ -25918,7 +25922,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: - case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll index 40e101756ef..002bcebdf71 100644 --- a/test/CodeGen/X86/vec_ctbits.ll +++ b/test/CodeGen/X86/vec_ctbits.ll @@ -8,27 +8,26 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @footz(<2 x i64> %a) nounwind { ; CHECK-LABEL: footz: ; CHECK: # %bb.0: -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: psubq %xmm0, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlw $1, %xmm0 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm3, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlw $2, %xmm3 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: paddb %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlw $4, %xmm0 -; CHECK-NEXT: paddb %xmm3, %xmm0 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: pandn %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm0, %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: psadbw %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c @@ -112,27 +111,26 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind { ; CHECK-LABEL: promtz: ; CHECK: # %bb.0: ; CHECK-NEXT: por {{.*}}(%rip), %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm2, %xmm2 -; CHECK-NEXT: psubq %xmm0, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm3 -; CHECK-NEXT: paddq %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: psrlw $1, %xmm0 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psubb %xmm0, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; CHECK-NEXT: movdqa %xmm3, %xmm2 -; CHECK-NEXT: pand %xmm0, %xmm2 -; CHECK-NEXT: psrlw $2, %xmm3 -; CHECK-NEXT: pand %xmm0, %xmm3 -; CHECK-NEXT: paddb %xmm2, %xmm3 -; CHECK-NEXT: movdqa 
%xmm3, %xmm0 -; CHECK-NEXT: psrlw $4, %xmm0 -; CHECK-NEXT: paddb %xmm3, %xmm0 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK-NEXT: psadbw %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-NEXT: paddq %xmm0, %xmm1 +; CHECK-NEXT: pandn %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $1, %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: psubb %xmm1, %xmm0 +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: paddb %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: paddb %xmm0, %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: psadbw %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) ret <2 x i32> %c diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index a532794f89d..1430ca72f68 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -18,121 +18,112 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubq %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm2, %xmm2 -; SSE3-NEXT: psubq %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; 
SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psadbw %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: psubq %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: paddq %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddq %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pshufb %xmm4, %xmm5 -; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm3, %xmm0 -; SSSE3-NEXT: paddb %xmm5, %xmm0 -; SSSE3-NEXT: psadbw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 +; SSSE3-NEXT: paddb %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: psadbw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: psubq %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: paddq %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: 
pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pshufb %xmm4, %xmm5 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm3, %xmm0 -; SSE41-NEXT: paddb %xmm5, %xmm0 -; SSE41-NEXT: psadbw %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: 
vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper @@ -140,55 +131,50 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; 
BITALG-LABEL: testv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: psubq %xmm0, %xmm2 -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: psubq {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm2, %xmm4 -; X32-SSE-NEXT: pand %xmm3, %xmm4 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: pshufb %xmm4, %xmm5 -; X32-SSE-NEXT: psrlw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm3, %xmm2 -; X32-SSE-NEXT: pshufb %xmm2, %xmm0 -; X32-SSE-NEXT: paddb %xmm5, %xmm0 -; X32-SSE-NEXT: psadbw %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psubq {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 +; X32-SSE-NEXT: paddb %xmm4, %xmm3 +; X32-SSE-NEXT: pxor %xmm0, %xmm0 +; X32-SSE-NEXT: psadbw %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0) ret <2 x i64> %out @@ 
-197,129 +183,121 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64u: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubq %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: paddq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv2i64u: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm2, %xmm2 -; SSE3-NEXT: psubq %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: paddq %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand 
{{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psadbw %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psadbw %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv2i64u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: psubq %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: paddq %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddq %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pshufb %xmm4, %xmm5 -; SSSE3-NEXT: psrlw $4, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm3, %xmm0 -; SSSE3-NEXT: paddb %xmm5, %xmm0 -; SSSE3-NEXT: psadbw %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 +; SSSE3-NEXT: paddb %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: psadbw %xmm3, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv2i64u: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: psubq %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: paddq %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pand %xmm2, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pshufb %xmm4, %xmm5 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: pshufb %xmm3, %xmm0 -; SSE41-NEXT: paddb %xmm5, %xmm0 -; SSE41-NEXT: psadbw %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: testv2i64u: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; 
AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv2i64u: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, 
%xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -346,11 +324,9 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper @@ -358,55 +334,50 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; 
BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubq %xmm0, %xmm1, %xmm2 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: psubq %xmm0, %xmm2 -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: psubq {{\.LCPI.*}}, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm2, %xmm4 -; X32-SSE-NEXT: pand %xmm3, %xmm4 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm0, %xmm5 -; X32-SSE-NEXT: pshufb %xmm4, %xmm5 -; X32-SSE-NEXT: psrlw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm3, %xmm2 -; X32-SSE-NEXT: pshufb %xmm2, %xmm0 -; X32-SSE-NEXT: paddb %xmm5, %xmm0 -; X32-SSE-NEXT: psadbw %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE-NEXT: psubq {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm1, %xmm0 
+; X32-SSE-NEXT: pshufb %xmm0, %xmm3 +; X32-SSE-NEXT: paddb %xmm4, %xmm3 +; X32-SSE-NEXT: pxor %xmm0, %xmm0 +; X32-SSE-NEXT: psadbw %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1) ret <2 x i64> %out @@ -415,130 +386,124 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-LABEL: testv4i32: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psadbw %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), 
%xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psadbw %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv4i32: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm2, %xmm2 -; SSE3-NEXT: psubd %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: paddd %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: psadbw %xmm1, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: packuswb %xmm2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), 
%xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE3-NEXT: psadbw %xmm0, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: psadbw %xmm0, %xmm1 +; SSE3-NEXT: packuswb %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: psubd %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: paddd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pshufb %xmm4, %xmm5 -; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm3, %xmm0 -; SSSE3-NEXT: paddb %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: psadbw %xmm1, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: psadbw %xmm1, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psadbw %xmm0, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: psadbw %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: psubd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pshufb %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 ; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm4 -; SSE41-NEXT: paddb %xmm5, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE41-NEXT: psadbw %xmm1, %xmm3 ; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm4, %xmm0 +; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: testv4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand 
%xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -548,19 +513,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX2-LABEL: testv4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -570,19 +534,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512CDVL-LABEL: testv4i32: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512CDVL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb 
%xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -592,19 +555,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512CD-LABEL: testv4i32: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512CD-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -614,11 +576,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv4i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper @@ -626,22 +586,19 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv4i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv4i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -652,12 +609,11 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; BITALG-LABEL: testv4i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
BITALG-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -667,27 +623,25 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: psubd %xmm0, %xmm2 -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-SSE-NEXT: paddd %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pand %xmm2, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pshufb %xmm3, %xmm5 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddd %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 ; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufb %xmm0, %xmm4 -; X32-SSE-NEXT: paddb %xmm5, %xmm4 -; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; X32-SSE-NEXT: psadbw %xmm1, %xmm4 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; 
X32-SSE-NEXT: pshufb %xmm0, %xmm3 +; X32-SSE-NEXT: paddb %xmm4, %xmm3 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X32-SSE-NEXT: psadbw %xmm1, %xmm3 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE-NEXT: packuswb %xmm4, %xmm0 +; X32-SSE-NEXT: packuswb %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0) ret <4 x i32> %out @@ -696,130 +650,124 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-LABEL: testv4i32u: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psadbw %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; 
SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psadbw %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv4i32u: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm2, %xmm2 -; SSE3-NEXT: psubd %xmm0, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: paddd %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm2, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 -; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE3-NEXT: psadbw %xmm1, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE3-NEXT: psadbw %xmm1, %xmm0 -; SSE3-NEXT: packuswb %xmm2, %xmm0 +; 
SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE3-NEXT: psadbw %xmm0, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: psadbw %xmm0, %xmm1 +; SSE3-NEXT: packuswb %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: psubd %xmm0, %xmm2 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSSE3-NEXT: paddd %xmm2, %xmm3 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddd %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pshufb %xmm4, %xmm5 -; SSSE3-NEXT: psrlw $4, %xmm3 +; SSSE3-NEXT: movdqa %xmm0, %xmm3 ; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pshufb %xmm3, %xmm0 -; SSSE3-NEXT: paddb %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: psadbw %xmm1, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: psadbw %xmm1, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb 
%xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psadbw %xmm0, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: psadbw %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: psubd %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pshufb %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 ; SSE41-NEXT: psrlw $4, %xmm0 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufb %xmm0, %xmm4 -; SSE41-NEXT: paddb %xmm5, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE41-NEXT: psadbw %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE41-NEXT: psadbw %xmm1, %xmm3 ; SSE41-NEXT: psadbw %xmm1, %xmm0 -; 
SSE41-NEXT: packuswb %xmm4, %xmm0 +; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: testv4i32u: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -829,19 +777,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX2-LABEL: testv4i32u: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb 
%xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -872,11 +819,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv4i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper @@ -884,22 +829,19 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv4i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddd 
%xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv4i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG_NOVLX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero @@ -910,12 +852,11 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; BITALG-LABEL: testv4i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubd %xmm0, %xmm1, %xmm2 -; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0 -; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; BITALG-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -925,27 +866,25 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: psubd %xmm0, %xmm2 -; X32-SSE-NEXT: pand %xmm0, %xmm2 -; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-SSE-NEXT: paddd %xmm2, %xmm0 -; 
X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: pand %xmm2, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pshufb %xmm3, %xmm5 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddd %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE-NEXT: pand %xmm1, %xmm2 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm3, %xmm4 +; X32-SSE-NEXT: pshufb %xmm2, %xmm4 ; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pshufb %xmm0, %xmm4 -; X32-SSE-NEXT: paddb %xmm5, %xmm4 -; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero -; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; X32-SSE-NEXT: psadbw %xmm1, %xmm4 +; X32-SSE-NEXT: pand %xmm1, %xmm0 +; X32-SSE-NEXT: pshufb %xmm0, %xmm3 +; X32-SSE-NEXT: paddb %xmm4, %xmm3 +; X32-SSE-NEXT: pxor %xmm1, %xmm1 +; X32-SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero +; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X32-SSE-NEXT: psadbw %xmm1, %xmm3 ; X32-SSE-NEXT: psadbw %xmm1, %xmm0 -; X32-SSE-NEXT: packuswb %xmm4, %xmm0 +; X32-SSE-NEXT: packuswb %xmm3, %xmm0 ; X32-SSE-NEXT: retl %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) ret <4 x i32> %out @@ -954,11 +893,9 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-LABEL: testv8i16: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; 
SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 @@ -981,11 +918,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSE3-LABEL: testv8i16: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: psubw %xmm0, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE3-NEXT: paddw %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 @@ -1008,11 +943,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: psubw %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: paddw %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pand %xmm1, %xmm2 @@ -1031,11 +964,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psubw %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pand %xmm1, %xmm2 @@ -1054,11 +985,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX-LABEL: testv8i16: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; 
AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1074,11 +1003,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv8i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -1088,11 +1015,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv8i16: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 @@ -1101,11 +1026,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv8i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper @@ -1113,21 +1036,17 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; BITALG-LABEL: testv8i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: psubw %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm0, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddw %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: pand %xmm1, %xmm2 @@ -1150,11 +1069,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE2-LABEL: testv8i16u: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 @@ -1177,11 
+1094,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSE3-LABEL: testv8i16u: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: psubw %xmm0, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE3-NEXT: paddw %xmm1, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 @@ -1204,11 +1119,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: psubw %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 -; SSSE3-NEXT: paddw %xmm1, %xmm0 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddw %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSSE3-NEXT: movdqa %xmm0, %xmm2 ; SSSE3-NEXT: pand %xmm1, %xmm2 @@ -1227,11 +1140,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psubw %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE41-NEXT: movdqa %xmm0, %xmm2 ; SSE41-NEXT: pand %xmm1, %xmm2 @@ -1250,11 +1161,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX-LABEL: testv8i16u: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: 
vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1270,11 +1179,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -1284,11 +1191,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv8i16u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0 @@ -1297,11 +1202,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv8i16u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; 
BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper @@ -1309,21 +1212,17 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; BITALG-LABEL: testv8i16u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntw %xmm0, %xmm0 ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: psubw %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm0, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm0 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddw %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 ; X32-SSE-NEXT: pand %xmm1, %xmm2 @@ -1346,95 +1245,89 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE2-LABEL: testv16i8: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: pand 
%xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: psubb %xmm0, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm2 -; SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), 
%xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: psubb %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: paddb %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddb %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: paddb %xmm4, %xmm0 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psubb %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: paddb %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 +; SSE41-NEXT: movdqa 
{{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pshufb %xmm3, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: paddb %xmm4, %xmm0 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1447,11 +1340,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 @@ -1460,11 +1351,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv16i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 @@ -1473,11 +1362,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv16i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; 
BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper @@ -1485,31 +1372,28 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; BITALG-LABEL: testv16i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: psubb %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm0, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: paddb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 ; X32-SSE-NEXT: pshufb %xmm3, %xmm4 -; X32-SSE-NEXT: psrlw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm1, %xmm2 -; X32-SSE-NEXT: pshufb %xmm2, %xmm0 -; X32-SSE-NEXT: paddb %xmm4, %xmm0 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pshufb %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm4, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0) ret <16 x i8> %out @@ -1518,95 +1402,89 @@ define <16 x 
i8> @testv16i8(<16 x i8> %in) nounwind { define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE2-LABEL: testv16i8u: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psubb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8u: ; SSE3: # %bb.0: -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: psubb %xmm0, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm2 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm2, %xmm1 -; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm2 -; 
SSE3-NEXT: pand %xmm0, %xmm2 -; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: movdqa %xmm2, %xmm0 -; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pandn %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: psubb %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 -; SSSE3-NEXT: paddb %xmm1, %xmm2 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 +; SSSE3-NEXT: paddb %xmm0, %xmm1 +; SSSE3-NEXT: pandn %xmm1, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 ; SSSE3-NEXT: pshufb %xmm3, %xmm4 -; SSSE3-NEXT: psrlw $4, %xmm2 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: paddb %xmm4, %xmm0 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: 
movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: testv16i8u: ; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psubb %xmm0, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: paddb %xmm1, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: pandn %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: pshufb %xmm3, %xmm4 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: pand %xmm1, %xmm2 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: paddb %xmm4, %xmm0 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: testv16i8u: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1619,11 +1497,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, 
%xmm1 -; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 @@ -1632,11 +1508,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv16i8u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0 @@ -1645,11 +1519,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv16i8u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; BITALG_NOVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; BITALG_NOVLX-NEXT: vzeroupper @@ -1657,31 +1529,28 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; BITALG-LABEL: testv16i8u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubb %xmm0, %xmm1, %xmm1 -; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; BITALG-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; BITALG-NEXT: vpopcntb %xmm0, %xmm0 ; BITALG-NEXT: retq ; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: psubb %xmm0, %xmm1 -; X32-SSE-NEXT: pand %xmm0, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: paddb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pand %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddb %xmm0, %xmm1 +; X32-SSE-NEXT: pandn %xmm1, %xmm0 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSE-NEXT: movdqa %xmm0, %xmm3 +; X32-SSE-NEXT: pand %xmm2, %xmm3 +; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSE-NEXT: movdqa %xmm1, %xmm4 ; X32-SSE-NEXT: pshufb %xmm3, %xmm4 -; X32-SSE-NEXT: psrlw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm1, %xmm2 -; X32-SSE-NEXT: pshufb %xmm2, %xmm0 -; X32-SSE-NEXT: paddb %xmm4, 
%xmm0 +; X32-SSE-NEXT: psrlw $4, %xmm0 +; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pshufb %xmm0, %xmm1 +; X32-SSE-NEXT: paddb %xmm4, %xmm1 +; X32-SSE-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE-NEXT: retl %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1) ret <16 x i8> %out diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index cae0a2d605a..46c34fb0d44 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -15,144 +15,132 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, 
%xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; 
AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512CDVL-LABEL: testv4i64: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CDVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i64: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv4i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: testv4i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; 
AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv4i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-AVX-NEXT: vpshufb 
%ymm3, %ymm4, %ymm3 +; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0) @@ -163,48 +151,45 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb 
%xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv4i64u: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: 
vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -230,61 +215,54 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv4i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: testv4i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv4i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv4i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; X32-AVX-NEXT: vpaddb 
%ymm3, %ymm0, %ymm0 +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: retl %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1) @@ -295,56 +273,53 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} 
ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -354,19 +329,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512CDVL-LABEL: testv8i32: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CDVL-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CDVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CDVL-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -376,19 +350,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512CD-LABEL: testv8i32: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, 
%ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -398,33 +371,28 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv8i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: testv8i32: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv8i32: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG_NOVLX-NEXT: vpaddd 
%ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -434,12 +402,11 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; BITALG-LABEL: testv8i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -449,19 +416,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; ; X32-AVX-LABEL: testv8i32: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-AVX-NEXT: vpshufb 
%ymm3, %ymm4, %ymm3 +; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -476,56 +442,53 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm6, %xmm6 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32u: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -555,33 +518,28 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv8i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; 
AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: testv8i32u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: testv8i32u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG_NOVLX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -591,12 +549,11 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; BITALG-LABEL: testv8i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0 -; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; BITALG-NEXT: 
vpcmpeqd %ymm1, %ymm1, %ymm1 +; BITALG-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -606,19 +563,18 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; X32-AVX-LABEL: testv8i32u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; X32-AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm3 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; X32-AVX-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 +; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X32-AVX-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; X32-AVX-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; X32-AVX-NEXT: vpunpckhdq {{.*#+}} ymm2 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X32-AVX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 ; X32-AVX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] @@ -632,31 +588,28 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 -; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, 
%xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -666,11 +619,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -686,11 +637,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512CDVL-LABEL: testv16i16: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -706,11 +655,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512CD-LABEL: testv16i16: ; AVX512CD: # %bb.0: -; 
AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -726,11 +673,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -738,11 +683,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv16i16: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; 
AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 @@ -750,32 +693,26 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv16i16: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv16i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -795,31 +732,28 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 -; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; 
AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -829,11 +763,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX2-LABEL: testv16i16u: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -849,11 +781,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512CDVL-LABEL: testv16i16u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -869,11 +799,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512CD-LABEL: testv16i16u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; 
AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -889,11 +817,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 @@ -901,11 +827,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv16i16u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0 @@ -913,32 +837,26 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv16i16u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv16i16u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntw %ymm0, %ymm0 ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1 -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -959,38 +877,33 
@@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 
-; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1003,11 +916,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512CDVL-LABEL: testv32i8: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1020,11 +931,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512CD-LABEL: testv32i8: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1037,11 +946,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1054,11 +961,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv32i8: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1071,32 +976,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv32i8: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, 
%ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv32i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1114,38 +1013,33 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; 
AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpandn %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv32i8u: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1158,11 +1052,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) 
nounwind { ; ; AVX512CDVL-LABEL: testv32i8u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1175,11 +1067,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512CD-LABEL: testv32i8u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1192,11 +1082,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i8u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1209,11 +1097,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQVL-LABEL: testv32i8u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; AVX512VPOPCNTDQVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -1226,32 +1112,26 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; ; BITALG_NOVLX-LABEL: testv32i8u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG_NOVLX-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; BITALG_NOVLX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: testv32i8u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; BITALG-NEXT: 
vpaddb %ymm1, %ymm0, %ymm1 +; BITALG-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; BITALG-NEXT: vpopcntb %ymm0, %ymm0 ; BITALG-NEXT: retq ; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1 -; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 +; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 4a9fd82593a..300d7b4ac6c 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -8,11 +8,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -36,58 +34,53 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; ; AVX512CDBW-LABEL: testv8i64: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2 -; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; 
AVX512CDBW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CDBW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; AVX512CDBW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: 
vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv8i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubq %zmm0, %zmm1, %zmm2 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0) @@ -117,40 +110,36 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; ; AVX512BW-LABEL: testv8i64u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} 
zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv8i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubq %zmm0, %zmm1, %zmm2 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1) @@ -160,11 +149,9 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CD-LABEL: testv16i32: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %zmm0, 
%zmm1, %zmm1 -; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 @@ -196,19 +183,18 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; AVX512CDBW-LABEL: testv16i32: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2 -; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512CDBW-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; AVX512CDBW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512CDBW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; AVX512CDBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] @@ -218,19 +204,18 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; AVX512BW-LABEL: testv16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] @@ -240,22 +225,19 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i32: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv16i32: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubd %zmm0, %zmm1, %zmm2 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; 
BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] @@ -289,19 +271,18 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; ; AVX512BW-LABEL: testv16i32u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm4, %zmm0 -; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] @@ -311,22 +292,19 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv16i32u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv16i32u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubd %zmm0, %zmm1, %zmm2 -; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; BITALG-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, 
%zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 +; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 ; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] @@ -340,30 +318,27 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512CD-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm5 -; AVX512CD-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, 
%ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm0, %ymm4, %ymm0 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1 @@ -372,11 +347,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512CDBW-LABEL: testv32i16: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -392,11 +365,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; 
; AVX512BW-LABEL: testv32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -412,17 +383,14 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i16: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, 
%ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 @@ -430,11 +398,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; BITALG-LABEL: testv32i16: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubw %zmm0, %zmm1, %zmm1 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; BITALG-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) @@ -444,30 +410,27 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512CD-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm5 -; AVX512CD-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm0, %ymm4, %ymm0 ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsubw %ymm1, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1 @@ -476,11 +439,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; ; AVX512CDBW-LABEL: testv32i16u: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -496,11 +457,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; ; AVX512BW-LABEL: testv32i16u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -516,17 +475,14 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i16u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 @@ -534,11 +490,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; ; BITALG-LABEL: testv32i16u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubw %zmm0, %zmm1, %zmm1 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; BITALG-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpaddw %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntw %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) @@ -548,37 +502,32 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpternlogd 
$255, %zmm1, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -591,11 +540,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512BW-LABEL: testv64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -608,37 +555,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv64i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: 
retq ; ; BITALG-LABEL: testv64i8: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubb %zmm0, %zmm1, %zmm1 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0) @@ -648,37 +590,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsubb %ymm1, %ymm2, %ymm2 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm3, 
%ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8u: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -691,11 +628,9 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; ; AVX512BW-LABEL: testv64i8u: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpandnq %zmm1, 
%zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] @@ -708,37 +643,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv64i8u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpsubb 
%ymm1, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv64i8u: ; BITALG: # %bb.0: -; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; BITALG-NEXT: vpsubb %zmm0, %zmm1, %zmm1 -; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm1 +; BITALG-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; BITALG-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1) -- GitLab From 3b18a97f3ac2eb58715703462cef16f8e0e8d4e1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 13:05:19 +0000 Subject: [PATCH 0144/1116] [X86][SSE] Improve CTTZ lowering when CTLZ is legal If we have better CTLZ support than CTPOP, then use cttz(x) = width - ctlz(~x & (x - 1)) - and remove the CTTZ_ZERO_UNDEF handling as it no longer gives better codegen. 
Similar to rL344447, this is also closer to LegalizeDAG's approach git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344448 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 24 ++--- test/CodeGen/X86/vector-tzcnt-128.ll | 137 +++++++++++++++------------ test/CodeGen/X86/vector-tzcnt-256.ll | 92 ++++++------------ test/CodeGen/X86/vector-tzcnt-512.ll | 114 ++++++---------------- 4 files changed, 146 insertions(+), 221 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5fb3ece19f2..5f1e9ef1b03 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22982,20 +22982,22 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); - // cttz_undef(x) = (width - 1) - ctlz(x & -x) - if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { - SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); - SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, - DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); - return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, - DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + // Tmp = ~x & (x - 1) + SDValue One = DAG.getConstant(1, dl, VT); + SDValue Tmp = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), + DAG.getNode(ISD::SUB, dl, VT, N0, One)); + + // cttz(x) = width - ctlz(~x & (x - 1)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(ISD::CTLZ, VT) && + !TLI.isOperationLegal(ISD::CTPOP, VT)) { + SDValue Width = DAG.getConstant(NumBits, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, Width, + DAG.getNode(ISD::CTLZ, dl, VT, Tmp)); } // cttz(x) = ctpop(~x & (x - 1)) - SDValue One = DAG.getConstant(1, dl, VT); - return DAG.getNode(ISD::CTPOP, dl, VT, - DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), - DAG.getNode(ISD::SUB, dl, VT, N0, One))); + return DAG.getNode(ISD::CTPOP, dl, VT, Tmp); } 
assert(Op.getOpcode() == ISD::CTTZ && diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 1430ca72f68..0d392bb5117 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -102,22 +102,60 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv2i64: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv2i64: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vzeroupper +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv2i64: ; AVX512VPOPCNTDQ: # %bb.0: @@ -303,21 +341,21 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512CDVL-LABEL: testv2i64u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntq %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] ; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv2i64u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand 
%xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -537,20 +575,9 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512CDVL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] +; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i32: @@ -558,20 +585,10 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 
= [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX512CD-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512CD-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] +; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv4i32: @@ -798,21 +815,21 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512CDVL-LABEL: testv4i32u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CDVL-NEXT: vplzcntd %xmm0, %xmm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] ; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i32u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31] +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32] ; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, 
%xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 46c34fb0d44..59911e5805b 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -62,16 +62,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] +; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i64: @@ -79,16 +72,9 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; 
AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] +; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv4i64: @@ -195,21 +181,21 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; AVX512CDVL-LABEL: testv4i64u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CDVL-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] +; AVX512CDVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] ; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv4i64u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64] ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; @@ -332,20 +318,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CDVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CDVL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CDVL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CDVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX512CDVL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512CDVL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] +; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv8i32: @@ -353,20 +328,9 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX512CD-NEXT: vpsadbw %ymm1, %ymm2, %ymm2 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512CD-NEXT: vpsadbw %ymm1, 
%ymm0, %ymm0 -; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] +; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i32: @@ -498,21 +462,21 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; ; AVX512CDVL-LABEL: testv8i32u: ; AVX512CDVL: # %bb.0: -; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CDVL-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512CDVL-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0 -; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] +; AVX512CDVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] ; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CDVL-NEXT: retq ; ; AVX512CD-LABEL: testv8i32u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddd %ymm1, %ymm0, %ymm1 +; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31] +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 300d7b4ac6c..1de03463e19 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -11,25 +11,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CD-NEXT: vextracti64x4 $1, 
%zmm0, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] +; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv8i64: @@ -37,16 +21,9 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CDBW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512CDBW-NEXT: 
vpandq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] +; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv8i64: @@ -90,21 +67,21 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CD-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] ; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv8i64u: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CDBW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63] +; AVX512CDBW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64] ; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; @@ -152,33 +129,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CD-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm3 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CD-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm5, %ymm5 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5 -; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm5, %ymm0, %ymm0 -; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm2, %ymm2 -; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5] -; AVX512CD-NEXT: vpsadbw %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv16i32: @@ -186,20 +139,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 
; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 ; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2 -; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 -; AVX512CDBW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2 -; AVX512CDBW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; AVX512CDBW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; ; AVX512BW-LABEL: testv16i32: @@ -251,21 +193,21 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512CD-LABEL: testv16i32u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1 -; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CD-NEXT: vpaddd %zmm1, 
%zmm0, %zmm1 +; AVX512CD-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv16i32u: ; AVX512CDBW: # %bb.0: -; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1 -; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512CDBW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 +; AVX512CDBW-NEXT: vpandnq %zmm1, %zmm0, %zmm0 ; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0 ; AVX512CDBW-NEXT: retq ; -- GitLab From 1a9bbe2528fdcbac90886b04897d51f10a1ecb84 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 13:30:10 +0000 Subject: [PATCH 0145/1116] Remove unused variable. NFCI. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344449 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 5f1e9ef1b03..bb75f6e0f17 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22976,7 +22976,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, if (VT.isVector()) { SDValue N0 = Op.getOperand(0); - SDValue Zero = DAG.getConstant(0, dl, VT); // Decompose 256-bit ops into smaller 128-bit ops. 
if (VT.is256BitVector() && !Subtarget.hasInt256()) -- GitLab From 7ecda486e97cf17a7d97bbf9e6bfa7e768d51879 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 13:33:32 +0000 Subject: [PATCH 0146/1116] Pull out repeated getOperand(). NFCI. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344450 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index bb75f6e0f17..835e272f52b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -22972,11 +22972,10 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); + SDValue N0 = Op.getOperand(0); SDLoc dl(Op); if (VT.isVector()) { - SDValue N0 = Op.getOperand(0); - // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); @@ -23004,7 +23003,7 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { -- GitLab From 8965b5dc749a3b320c5e902cd639837f7a734015 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 14:28:40 +0000 Subject: [PATCH 0147/1116] [X86] Pull out target constant splat helper function. NFCI. The code in LowerScalarImmediateShift is just a more powerful version of ISD::isConstantSplatVector. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344451 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 44 ++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 835e272f52b..d6699c6e678 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5830,6 +5830,30 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; } +static bool isConstantSplat(SDValue Op, APInt &SplatVal) { + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), + UndefElts, EltBits, true, false)) { + int SplatIndex = -1; + for (int i = 0, e = EltBits.size(); i != e; ++i) { + if (UndefElts[i]) + continue; + if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { + SplatIndex = -1; + break; + } + SplatIndex = i; + } + if (0 <= SplatIndex) { + SplatVal = EltBits[SplatIndex]; + return true; + } + } + + return false; +} + static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask) { @@ -23600,7 +23624,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); - unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false); auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { @@ -23644,24 +23667,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, }; // Optimize shl/srl/sra with constant shift amount. 
- APInt UndefElts; - SmallVector EltBits; - if (!getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits, - true, false)) - return SDValue(); - - int SplatIndex = -1; - for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - if (UndefElts[i]) - continue; - if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) - return SDValue(); - SplatIndex = i; - } - if (SplatIndex < 0) + APInt APIntShiftAmt; + if (!isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); + uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - uint64_t ShiftAmt = EltBits[SplatIndex].getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); -- GitLab From 097be3b516e9c3d213769b842b55800b0159339b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 14:45:44 +0000 Subject: [PATCH 0148/1116] [X86][SSE] combineIncDecVector - use isConstantSplat Use isConstantSplat instead of ISD::isConstantSplatVector to let us peek through to illegal types (in this case for i686 targets to recognise i64 constants) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344452 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 4 +--- test/CodeGen/X86/vector-tzcnt-128.ll | 8 ++++---- test/CodeGen/X86/vector-tzcnt-256.ll | 6 ++++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d6699c6e678..18c5f60f2f2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -39603,10 +39603,8 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); - SDNode *N1 = N->getOperand(1).getNode(); APInt SplatVal; - if (!ISD::isConstantSplatVector(N1, SplatVal) || - !SplatVal.isOneValue()) + if (!isConstantSplat(N->getOperand(1), SplatVal) ||
!SplatVal.isOneValue()) return SDValue(); SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 0d392bb5117..21142ff3970 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -198,8 +198,8 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psubq {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddq %xmm0, %xmm1 ; X32-SSE-NEXT: pandn %xmm1, %xmm0 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 @@ -401,8 +401,8 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psubq {{\.LCPI.*}}, %xmm1 +; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1 +; X32-SSE-NEXT: paddq %xmm0, %xmm1 ; X32-SSE-NEXT: pandn %xmm1, %xmm0 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-SSE-NEXT: movdqa %xmm0, %xmm2 diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index 59911e5805b..c7087037e01 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -116,7 +116,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm1 +; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 @@ -238,7 +239,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; ; X32-AVX-LABEL: testv4i64u: ; 
X32-AVX: # %bb.0: -; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm1 +; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 -- GitLab From e3800c79ea4638c860b7fe2d6bfd120793cd3c48 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 15:16:55 +0000 Subject: [PATCH 0149/1116] [X86][SSE] Begin removing vector CTTZ custom lowering and use LegalizeDAG instead. Adds CTTZ vector legalization support and begins the removal of the X86/SSE custom lowering. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344453 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 18 +++++++++++++++--- lib/Target/X86/X86ISelLowering.cpp | 17 ++++++++--------- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 56025110f0a..884d7174440 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2794,7 +2794,7 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // This trivially expands to CTTZ. 
return DAG.getNode(ISD::CTTZ, dl, VT, Op); case ISD::CTTZ: { - unsigned Len = VT.getSizeInBits(); + unsigned Len = VT.getScalarSizeInBits(); if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8cc37b5f233..58d86e8e52e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -129,7 +129,7 @@ class VectorLegalizer { SDValue ExpandFSUB(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); SDValue ExpandCTLZ(SDValue Op); - SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op); + SDValue ExpandCTTZ(SDValue Op); SDValue ExpandStrictFPOp(SDValue Op); /// Implements vector promotion. @@ -717,8 +717,9 @@ SDValue VectorLegalizer::Expand(SDValue Op) { case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return ExpandCTLZ(Op); + case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: - return ExpandCTTZ_ZERO_UNDEF(Op); + return ExpandCTTZ(Op); case ISD::STRICT_FADD: case ISD::STRICT_FSUB: case ISD::STRICT_FMUL: @@ -1094,8 +1095,9 @@ SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { return DAG.UnrollVectorOp(Op.getNode()); } -SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) { +SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) { EVT VT = Op.getValueType(); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); // If the non-ZERO_UNDEF version is supported we can use that instead. if (TLI.isOperationLegalOrCustom(ISD::CTTZ, VT)) { @@ -1103,6 +1105,16 @@ SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) { return DAG.getNode(ISD::CTTZ, DL, VT, Op.getOperand(0)); } + // If we have the appropriate vector bit operations, it is better to use them + // than unrolling and expanding each component. 
+ if (isPowerOf2_32(NumBitsPerElt) && + (TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) || + TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) && + TLI.isOperationLegalOrCustom(ISD::SUB, VT) && + TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) && + TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT)) + return Op; + // Otherwise go ahead and unroll. return DAG.UnrollVectorOp(Op.getNode()); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 18c5f60f2f2..1411cf18902 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -23004,22 +23004,21 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); - // Tmp = ~x & (x - 1) - SDValue One = DAG.getConstant(1, dl, VT); - SDValue Tmp = DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), - DAG.getNode(ISD::SUB, dl, VT, N0, One)); - // cttz(x) = width - ctlz(~x & (x - 1)) const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(ISD::CTLZ, VT) && !TLI.isOperationLegal(ISD::CTPOP, VT)) { + SDValue One = DAG.getConstant(1, dl, VT); SDValue Width = DAG.getConstant(NumBits, dl, VT); - return DAG.getNode(ISD::SUB, dl, VT, Width, - DAG.getNode(ISD::CTLZ, dl, VT, Tmp)); + return DAG.getNode( + ISD::SUB, dl, VT, Width, + DAG.getNode(ISD::CTLZ, dl, VT, + DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), + DAG.getNode(ISD::SUB, dl, VT, N0, One)))); } - // cttz(x) = ctpop(~x & (x - 1)) - return DAG.getNode(ISD::CTPOP, dl, VT, Tmp); + // Else leave it to the legalizer. 
+ return SDValue(); } assert(Op.getOpcode() == ISD::CTTZ && -- GitLab From c323e923ae334e978f63714e73517d6d1a51abe9 Mon Sep 17 00:00:00 2001 From: David Bolvansky Date: Sat, 13 Oct 2018 15:21:55 +0000 Subject: [PATCH 0150/1116] [InstCombine] Fixed crash with aliased functions Summary: Fixes PR39177 Reviewers: spatel, jbuening Reviewed By: jbuening Subscribers: jbuening, llvm-commits Differential Revision: https://reviews.llvm.org/D53129 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344454 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Utils/BuildLibCalls.h | 2 +- lib/Transforms/IPO/InferFunctionAttrs.cpp | 2 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +- lib/Transforms/Utils/BuildLibCalls.cpp | 46 +++++++------ lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +- test/Transforms/InstCombine/pr39177.ll | 66 +++++++++++++++++++ 6 files changed, 95 insertions(+), 25 deletions(-) create mode 100644 test/Transforms/InstCombine/pr39177.ll diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h index bdcdf6f361f..ab7d22c024c 100644 --- a/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -28,7 +28,7 @@ namespace llvm { /// If the library function is unavailable, this doesn't modify it. /// /// Returns true if any attributes were set and false otherwise. - bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI); + bool inferLibFuncAttributes(Function *Func, const TargetLibraryInfo &TLI); /// Check whether the overloaded unary floating point function /// corresponding to \a Ty is available. 
diff --git a/lib/Transforms/IPO/InferFunctionAttrs.cpp b/lib/Transforms/IPO/InferFunctionAttrs.cpp index 470f97b8ba6..c53a9b5e819 100644 --- a/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -27,7 +27,7 @@ static bool inferAllPrototypeAttributes(Module &M, // We only infer things using the prototype and the name; we don't need // definitions. if (F.isDeclaration() && !F.hasFnAttribute((Attribute::OptimizeNone))) - Changed |= inferLibFuncAttributes(F, TLI); + Changed |= inferLibFuncAttributes(&F, TLI); return Changed; } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 68abf9719a9..9a45551f64b 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -931,7 +931,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntPtr); - inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI); + inferLibFuncAttributes(M->getFunction("memset_pattern16"), *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be // an constant array of 16-bytes. Plop the value into a mergable global. 
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 6eb39e5b959..234449b2bf8 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -121,7 +121,11 @@ static bool setNonLazyBind(Function &F) { return true; } -bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { +bool llvm::inferLibFuncAttributes(Function *Func, + const TargetLibraryInfo &TLI) { + if (!Func) + return false; + Function &F = *Func; LibFunc TheLibFunc; if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc))) return false; @@ -773,7 +777,7 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context), B.getInt8PtrTy()); - inferLibFuncAttributes(*M->getFunction("strlen"), *TLI); + inferLibFuncAttributes(M->getFunction("strlen"), *TLI); CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen"); if (const Function *F = dyn_cast(StrLen->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -791,7 +795,7 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B, Type *I32Ty = B.getInt32Ty(); Constant *StrChr = M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty); - inferLibFuncAttributes(*M->getFunction("strchr"), *TLI); + inferLibFuncAttributes(M->getFunction("strchr"), *TLI); CallInst *CI = B.CreateCall( StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr"); if (const Function *F = dyn_cast(StrChr->stripPointerCasts())) @@ -809,7 +813,7 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)); - inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI); + inferLibFuncAttributes(M->getFunction("strncmp"), *TLI); CallInst *CI = B.CreateCall( StrNCmp, 
{castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp"); @@ -827,7 +831,7 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, Module *M = B.GetInsertBlock()->getModule(); Type *I8Ptr = B.getInt8PtrTy(); Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr); - inferLibFuncAttributes(*M->getFunction(Name), *TLI); + inferLibFuncAttributes(M->getFunction(Name), *TLI); CallInst *CI = B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name); if (const Function *F = dyn_cast(StrCpy->stripPointerCasts())) @@ -844,7 +848,7 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B, Type *I8Ptr = B.getInt8PtrTy(); Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr, Len->getType()); - inferLibFuncAttributes(*M->getFunction(Name), *TLI); + inferLibFuncAttributes(M->getFunction(Name), *TLI); CallInst *CI = B.CreateCall( StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy"); if (const Function *F = dyn_cast(StrNCpy->stripPointerCasts())) @@ -885,7 +889,7 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B, Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context)); - inferLibFuncAttributes(*M->getFunction("memchr"), *TLI); + inferLibFuncAttributes(M->getFunction("memchr"), *TLI); CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr"); if (const Function *F = dyn_cast(MemChr->stripPointerCasts())) @@ -904,7 +908,7 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B, Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)); - inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI); + inferLibFuncAttributes(M->getFunction("memcmp"), *TLI); CallInst *CI = B.CreateCall( MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp"); @@ -974,7 +978,7 @@ Value 
*llvm::emitPutChar(Value *Char, IRBuilder<> &B, Module *M = B.GetInsertBlock()->getModule(); Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty()); - inferLibFuncAttributes(*M->getFunction("putchar"), *TLI); + inferLibFuncAttributes(M->getFunction("putchar"), *TLI); CallInst *CI = B.CreateCall(PutChar, B.CreateIntCast(Char, B.getInt32Ty(), @@ -995,7 +999,7 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B, Module *M = B.GetInsertBlock()->getModule(); Value *PutS = M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy()); - inferLibFuncAttributes(*M->getFunction("puts"), *TLI); + inferLibFuncAttributes(M->getFunction("puts"), *TLI); CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts"); if (const Function *F = dyn_cast(PutS->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -1011,7 +1015,7 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B, Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction("fputc"), *TLI); + inferLibFuncAttributes(M->getFunction("fputc"), *TLI); Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true, "chari"); CallInst *CI = B.CreateCall(F, {Char, File}, "fputc"); @@ -1030,7 +1034,7 @@ Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B, Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(), B.getInt32Ty(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction("fputc_unlocked"), *TLI); + inferLibFuncAttributes(M->getFunction("fputc_unlocked"), *TLI); Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari"); CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked"); @@ -1049,7 +1053,7 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B, Constant *F = M->getOrInsertFunction( FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType()); 
if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI); + inferLibFuncAttributes(M->getFunction(FPutsName), *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs"); if (const Function *Fn = dyn_cast(F->stripPointerCasts())) @@ -1067,7 +1071,7 @@ Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B, Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction(FPutsUnlockedName), *TLI); + inferLibFuncAttributes(M->getFunction(FPutsUnlockedName), *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked"); if (const Function *Fn = dyn_cast(F->stripPointerCasts())) @@ -1088,7 +1092,7 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI); + inferLibFuncAttributes(M->getFunction(FWriteName), *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, ConstantInt::get(DL.getIntPtrType(Context), 1), File}); @@ -1107,7 +1111,7 @@ Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL, LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(), DL.getIntPtrType(Context)); - inferLibFuncAttributes(*M->getFunction("malloc"), *TLI); + inferLibFuncAttributes(M->getFunction("malloc"), *TLI); CallInst *CI = B.CreateCall(Malloc, Num, "malloc"); if (const Function *F = dyn_cast(Malloc->stripPointerCasts())) @@ -1126,7 +1130,7 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs, IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext())); Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(), PtrType, PtrType); - 
inferLibFuncAttributes(*M->getFunction("calloc"), TLI); + inferLibFuncAttributes(M->getFunction("calloc"), TLI); CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc"); if (const auto *F = dyn_cast(Calloc->stripPointerCasts())) @@ -1149,7 +1153,7 @@ Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File, DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction(FWriteUnlockedName), *TLI); + inferLibFuncAttributes(M->getFunction(FWriteUnlockedName), *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File}); if (const Function *Fn = dyn_cast(F->stripPointerCasts())) @@ -1166,7 +1170,7 @@ Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B, Constant *F = M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction("fgetc_unlocked"), *TLI); + inferLibFuncAttributes(M->getFunction("fgetc_unlocked"), *TLI); CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked"); if (const Function *Fn = dyn_cast(F->stripPointerCasts())) @@ -1183,7 +1187,7 @@ Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File, Constant *F = M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), File->getType()); - inferLibFuncAttributes(*M->getFunction("fgets_unlocked"), *TLI); + inferLibFuncAttributes(M->getFunction("fgets_unlocked"), *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked"); @@ -1206,7 +1210,7 @@ Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File, DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType()); if (File->getType()->isPointerTy()) - inferLibFuncAttributes(*M->getFunction(FReadUnlockedName), *TLI); + inferLibFuncAttributes(M->getFunction(FReadUnlockedName), *TLI); CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, 
File}); if (const Function *Fn = dyn_cast(F->stripPointerCasts())) diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 41a495a0484..6f24dc10e1e 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -145,7 +145,7 @@ static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B, Func != LibFunc_fopen) return false; - inferLibFuncAttributes(*CI->getCalledFunction(), *TLI); + inferLibFuncAttributes(CI->getCalledFunction(), *TLI); if (PointerMayBeCaptured(File, true, true)) return false; diff --git a/test/Transforms/InstCombine/pr39177.ll b/test/Transforms/InstCombine/pr39177.ll new file mode 100644 index 00000000000..750e17a01f1 --- /dev/null +++ b/test/Transforms/InstCombine/pr39177.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -instcombine -S + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } + +@stderr = external global %struct._IO_FILE*, align 8 +@.str = private constant [8 x i8] c"crash!\0A\00", align 1 + +@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias + +define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) { +entry: + %ptr.addr = alloca i8*, align 8 + %size.addr = alloca i64, align 8 + %n.addr = alloca i64, align 8 + %s.addr = alloca %struct._IO_FILE*, align 8 + store i8* %ptr, i8** %ptr.addr, align 8 + store i64 %size, i64* %size.addr, align 8 + store i64 %n, i64* %n.addr, align 8 + store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8 + ret i64 0 +} + +define void @foo() { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call 
= call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0)) + ret void +} + +declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) +; RUN: opt < %s -instcombine -S + +%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } +%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } + +@stderr = external global %struct._IO_FILE*, align 8 +@.str = private constant [8 x i8] c"crash!\0A\00", align 1 + +@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias + +define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) { +entry: + %ptr.addr = alloca i8*, align 8 + %size.addr = alloca i64, align 8 + %n.addr = alloca i64, align 8 + %s.addr = alloca %struct._IO_FILE*, align 8 + store i8* %ptr, i8** %ptr.addr, align 8 + store i64 %size, i64* %size.addr, align 8 + store i64 %n, i64* %n.addr, align 8 + store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8 + ret i64 0 +} + +define void @foo() { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 + %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0)) + ret void +} + +declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) 
-- GitLab From e9226f019e3c62982bb6f81acf2e24701b0748b7 Mon Sep 17 00:00:00 2001 From: David Bolvansky Date: Sat, 13 Oct 2018 15:26:13 +0000 Subject: [PATCH 0151/1116] [NFC] Fixed duplicated test file git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344455 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/InstCombine/pr39177.ll | 33 -------------------------- 1 file changed, 33 deletions(-) diff --git a/test/Transforms/InstCombine/pr39177.ll b/test/Transforms/InstCombine/pr39177.ll index 750e17a01f1..a047a079f58 100644 --- a/test/Transforms/InstCombine/pr39177.ll +++ b/test/Transforms/InstCombine/pr39177.ll @@ -31,36 +31,3 @@ entry: } declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) -; RUN: opt < %s -instcombine -S - -%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } -%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } - -@stderr = external global %struct._IO_FILE*, align 8 -@.str = private constant [8 x i8] c"crash!\0A\00", align 1 - -@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias - -define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) { -entry: - %ptr.addr = alloca i8*, align 8 - %size.addr = alloca i64, align 8 - %n.addr = alloca i64, align 8 - %s.addr = alloca %struct._IO_FILE*, align 8 - store i8* %ptr, i8** %ptr.addr, align 8 - store i64 %size, i64* %size.addr, align 8 - store i64 %n, i64* %n.addr, align 8 - store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8 - ret i64 0 -} - -define void @foo() { -entry: - %retval = alloca i32, align 4 - store i32 0, i32* %retval, align 4 - %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8 - %call = call i32 (%struct._IO_FILE*, i8*, ...) 
@fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0)) - ret void -} - -declare i32 @fprintf(%struct._IO_FILE*, i8*, ...) -- GitLab From 7875f53fbec75964572823e79a58eee5df2e514d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sat, 13 Oct 2018 16:02:47 +0000 Subject: [PATCH 0152/1116] [InstCombine] add tests for operand complexity canonicalization; NFC The tests with undef vector elements demonstrate a hole in the current pattern matching. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344456 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/operand-complexity.ll | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 test/Transforms/InstCombine/operand-complexity.ll diff --git a/test/Transforms/InstCombine/operand-complexity.ll b/test/Transforms/InstCombine/operand-complexity.ll new file mode 100644 index 00000000000..747b0c836a5 --- /dev/null +++ b/test/Transforms/InstCombine/operand-complexity.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +; 'Negate' is considered less complex than a normal binop, so the mul should have the binop as the first operand. 
+ +define i8 @neg(i8 %x) { +; CHECK-LABEL: @neg( +; CHECK-NEXT: [[BO:%.*]] = udiv i8 [[X:%.*]], 42 +; CHECK-NEXT: [[NEGX:%.*]] = sub i8 0, [[X]] +; CHECK-NEXT: [[R:%.*]] = mul i8 [[BO]], [[NEGX]] +; CHECK-NEXT: ret i8 [[R]] +; + %bo = udiv i8 %x, 42 + %negx = sub i8 0, %x + %r = mul i8 %negx, %bo + ret i8 %r +} + +define <2 x i8> @neg_vec(<2 x i8> %x) { +; CHECK-LABEL: @neg_vec( +; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> zeroinitializer, [[X]] +; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[BO]], [[NEGX]] +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %bo = udiv <2 x i8> %x, + %negx = sub <2 x i8> , %x + %r = mul <2 x i8> %negx, %bo + ret <2 x i8> %r +} + +define <2 x i8> @neg_vec_undef(<2 x i8> %x) { +; CHECK-LABEL: @neg_vec_undef( +; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> , [[X]] +; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[NEGX]], [[BO]] +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %bo = udiv <2 x i8> %x, + %negx = sub <2 x i8> , %x + %r = mul <2 x i8> %negx, %bo + ret <2 x i8> %r +} + +; 'Not' is considered less complex than a normal binop, so the mul should have the binop as the first operand. 
+ +define i8 @not(i8 %x) { +; CHECK-LABEL: @not( +; CHECK-NEXT: [[BO:%.*]] = udiv i8 [[X:%.*]], 42 +; CHECK-NEXT: [[NOTX:%.*]] = xor i8 [[X]], -1 +; CHECK-NEXT: [[R:%.*]] = mul i8 [[BO]], [[NOTX]] +; CHECK-NEXT: ret i8 [[R]] +; + %bo = udiv i8 %x, 42 + %notx = xor i8 -1, %x + %r = mul i8 %notx, %bo + ret i8 %r +} + +define <2 x i8> @not_vec(<2 x i8> %x) { +; CHECK-LABEL: @not_vec( +; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], +; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]] +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %bo = udiv <2 x i8> %x, + %notx = xor <2 x i8> , %x + %r = mul <2 x i8> %notx, %bo + ret <2 x i8> %r +} + +define <2 x i8> @not_vec_undef(<2 x i8> %x) { +; CHECK-LABEL: @not_vec_undef( +; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], +; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[NOTX]], [[BO]] +; CHECK-NEXT: ret <2 x i8> [[R]] +; + %bo = udiv <2 x i8> %x, + %notx = xor <2 x i8> , %x + %r = mul <2 x i8> %notx, %bo + ret <2 x i8> %r +} + +; 'Fneg' is considered less complex than a normal binop, so the fmul should have the binop as the first operand. +; Extra uses are required to ensure that the fneg is not canonicalized after the fmul. 
+ +declare void @use(float) +declare void @use_vec(<2 x float>) + +define float @fneg(float %x) { +; CHECK-LABEL: @fneg( +; CHECK-NEXT: [[BO:%.*]] = fdiv float [[X:%.*]], 4.200000e+01 +; CHECK-NEXT: [[FNEGX:%.*]] = fsub float -0.000000e+00, [[X]] +; CHECK-NEXT: [[R:%.*]] = fmul float [[BO]], [[FNEGX]] +; CHECK-NEXT: call void @use(float [[FNEGX]]) +; CHECK-NEXT: ret float [[R]] +; + %bo = fdiv float %x, 42.0 + %fnegx = fsub float -0.0, %x + %r = fmul float %fnegx, %bo + call void @use(float %fnegx) + ret float %r +} + +define <2 x float> @fneg_vec(<2 x float> %x) { +; CHECK-LABEL: @fneg_vec( +; CHECK-NEXT: [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], +; CHECK-NEXT: [[FNEGX:%.*]] = fsub <2 x float> , [[X]] +; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]] +; CHECK-NEXT: call void @use_vec(<2 x float> [[FNEGX]]) +; CHECK-NEXT: ret <2 x float> [[R]] +; + %bo = fdiv <2 x float> %x, + %fnegx = fsub <2 x float> , %x + %r = fmul <2 x float> %fnegx, %bo + call void @use_vec(<2 x float> %fnegx) + ret <2 x float> %r +} + +define <2 x float> @fneg_vec_undef(<2 x float> %x) { +; CHECK-LABEL: @fneg_vec_undef( +; CHECK-NEXT: [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], +; CHECK-NEXT: [[FNEGX:%.*]] = fsub <2 x float> , [[X]] +; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[FNEGX]], [[BO]] +; CHECK-NEXT: call void @use_vec(<2 x float> [[FNEGX]]) +; CHECK-NEXT: ret <2 x float> [[R]] +; + %bo = fdiv <2 x float> %x, + %fnegx = fsub <2 x float> , %x + %r = fmul <2 x float> %fnegx, %bo + call void @use_vec(<2 x float> %fnegx) + ret <2 x float> %r +} + -- GitLab From 1000de2443cae3c5e9cb2f3c39bf8e846c6005b8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 16:11:15 +0000 Subject: [PATCH 0153/1116] [X86][SSE] Remove most of vector CTTZ custom lowering and use LegalizeDAG instead. There is one remnant - AVX1 custom splitting of 256-bit vectors - which is due to a regression where the X86ISD::ANDNP is still performed as a YMM. 
I've also tightened the CTLZ or CTPOP lowering in SelectionDAGLegalize::ExpandBitCount to require a legal CTLZ - it doesn't affect existing users and fixes an issue with AVX512 codegen. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344457 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 +-- lib/Target/X86/X86ISelLowering.cpp | 35 +++++------------------- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 884d7174440..3564a767a09 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2814,8 +2814,8 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT))); // If ISD::CTLZ is legal and CTPOP isn't, then do that instead. - if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) && - TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) + if (!TLI.isOperationLegal(ISD::CTPOP, VT) && + TLI.isOperationLegal(ISD::CTLZ, VT)) return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(Len, dl, VT), DAG.getNode(ISD::CTLZ, dl, VT, Tmp3)); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1411cf18902..1abe642a830 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -826,7 +826,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. 
@@ -1083,9 +1082,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); + // TODO - remove this once 256-bit X86ISD::ANDNP correctly split. + setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom); + // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); @@ -1371,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); @@ -1402,7 +1402,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // NonVLX sub-targets extend 128/256 vectors to use the 512 version. 
for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() @@ -1491,7 +1490,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() @@ -1586,7 +1584,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -22999,29 +22996,11 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SDValue N0 = Op.getOperand(0); SDLoc dl(Op); - if (VT.isVector()) { - // Decompose 256-bit ops into smaller 128-bit ops. - if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); - - // cttz(x) = width - ctlz(~x & (x - 1)) - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.isOperationLegal(ISD::CTLZ, VT) && - !TLI.isOperationLegal(ISD::CTPOP, VT)) { - SDValue One = DAG.getConstant(1, dl, VT); - SDValue Width = DAG.getConstant(NumBits, dl, VT); - return DAG.getNode( - ISD::SUB, dl, VT, Width, - DAG.getNode(ISD::CTLZ, dl, VT, - DAG.getNode(ISD::AND, dl, VT, DAG.getNOT(dl, N0, VT), - DAG.getNode(ISD::SUB, dl, VT, N0, One)))); - } - - // Else leave it to the legalizer. - return SDValue(); - } + // Decompose 256-bit ops into smaller 128-bit ops. 
+ if (VT.is256BitVector() && !Subtarget.hasInt256()) + return Lower256IntUnary(Op, DAG); - assert(Op.getOpcode() == ISD::CTTZ && + assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. -- GitLab From 21706932d747dc7d4908e03577b3823fb27683d9 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sat, 13 Oct 2018 16:15:37 +0000 Subject: [PATCH 0154/1116] [InstCombine] fix complexity canonicalization with fake unary vector ops This is a preliminary step to avoid regressions when we add an actual 'fneg' instruction to IR. See D52934 and D53205. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344458 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/InstCombine/InstCombineInternal.h | 4 ++-- test/Transforms/InstCombine/operand-complexity.ll | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 46c598d4bfb..3a18744e434 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -82,8 +82,8 @@ class User; /// 5 -> Other instructions static inline unsigned getComplexity(Value *V) { if (isa(V)) { - if (isa(V) || BinaryOperator::isNeg(V) || - BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V)) + if (isa(V) || match(V, m_Neg(m_Value())) || + match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value()))) return 4; return 5; } diff --git a/test/Transforms/InstCombine/operand-complexity.ll b/test/Transforms/InstCombine/operand-complexity.ll index 747b0c836a5..20abe7b48f9 100644 --- a/test/Transforms/InstCombine/operand-complexity.ll +++ b/test/Transforms/InstCombine/operand-complexity.ll @@ -33,7 +33,7 @@ define <2 x i8> @neg_vec_undef(<2 x i8> %x) { ; CHECK-LABEL: @neg_vec_undef( ; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> , [[X]] -; 
CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[NEGX]], [[BO]] +; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[BO]], [[NEGX]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %bo = udiv <2 x i8> %x, @@ -74,7 +74,7 @@ define <2 x i8> @not_vec_undef(<2 x i8> %x) { ; CHECK-LABEL: @not_vec_undef( ; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], -; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[NOTX]], [[BO]] +; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %bo = udiv <2 x i8> %x, @@ -123,7 +123,7 @@ define <2 x float> @fneg_vec_undef(<2 x float> %x) { ; CHECK-LABEL: @fneg_vec_undef( ; CHECK-NEXT: [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], ; CHECK-NEXT: [[FNEGX:%.*]] = fsub <2 x float> , [[X]] -; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[FNEGX]], [[BO]] +; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]] ; CHECK-NEXT: call void @use_vec(<2 x float> [[FNEGX]]) ; CHECK-NEXT: ret <2 x float> [[R]] ; -- GitLab From bb14c3e538cceb4a3e9dd8f81e1cc46191030eef Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Sat, 13 Oct 2018 16:58:03 +0000 Subject: [PATCH 0155/1116] [WebAssembly][NFC] Fix signed/unsigned comparison warning git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344459 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index af5c03599cd..b0fd6cab229 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -30,7 +30,9 @@ defm "" : ARGUMENT; // Constrained immediate argument types foreach SIZE = [8, 16] in -def ImmI#SIZE : ImmLeaf; +def ImmI#SIZE : ImmLeaf; foreach SIZE = [2, 4, 8, 16, 32] in def LaneIdx#SIZE : ImmLeaf; -- GitLab From ebbe7135795e31b45ac408aa2174ca9314ce7bfd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 13 Oct 2018 
17:47:20 +0000 Subject: [PATCH 0156/1116] [LegalizeTypes] Prevent an assertion from PromoteIntRes_BSWAP and PromoteIntRes_BITREVERSE if the shift amount is too large for the VT returned by getShiftAmountTy Summary: getShiftAmountTy for X86 returns MVT::i8. If a BSWAP or BITREVERSE is created that requires promotion and the difference between the original VT and the promoted VT is more than 255 then we won't be able to create the constant. This patch adds a check to replace the result from getShiftAmountTy with MVT::i32 if the difference won't fit. This should get legalized later when the shift is ultimately expanded since it's clearly an illegal type that we're only promoting to make it a power of 2 bit width. Alternatively we could base the decision completely on the largest shift amount the promoted VT could use. Vectors should be immune here because getShiftAmountTy always returns the incoming VT for vectors. Only the scalar shift amount can be changed by the targets. Reviewers: eli.friedman, RKSimon, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53232 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344460 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 28 +- test/CodeGen/X86/bitreverse.ll | 618 ++++++++++++++++++ test/CodeGen/X86/bswap.ll | 150 +++++ 3 files changed, 788 insertions(+), 8 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index e11a18fd0c4..064e9e5875b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -311,6 +311,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) { CreateStackStoreLoad(InOp, OutVT)); } +// Helper for BSWAP/BITREVERSE promotion to ensure we can fit the shift amount +// in the VT returned by getShiftAmountTy and to return a safe VT if we can't. 
+static EVT getShiftAmountTyForConstant(unsigned Val, EVT VT, + const TargetLowering &TLI, + SelectionDAG &DAG) { + EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + // If the value won't fit in the prefered type, just use something safe. It + // will be legalized when the shift is expanded. + if ((Log2_32(Val) + 1) > ShiftVT.getScalarSizeInBits()) + ShiftVT = MVT::i32; + return ShiftVT; +} + SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); @@ -318,10 +331,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); - return DAG.getNode( - ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), - DAG.getConstant(DiffBits, dl, - TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); + EVT ShiftVT = getShiftAmountTyForConstant(DiffBits, NVT, TLI, DAG); + return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op), + DAG.getConstant(DiffBits, dl, ShiftVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { @@ -331,10 +343,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) { SDLoc dl(N); unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(); - return DAG.getNode( - ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), - DAG.getConstant(DiffBits, dl, - TLI.getShiftAmountTy(NVT, DAG.getDataLayout()))); + EVT ShiftVT = getShiftAmountTyForConstant(DiffBits, NVT, TLI, DAG); + return DAG.getNode(ISD::SRL, dl, NVT, + DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), + DAG.getConstant(DiffBits, dl, ShiftVT)); } SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) { diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll index 2e35fde6c55..aeac9e88dd0 100644 --- a/test/CodeGen/X86/bitreverse.ll +++ b/test/CodeGen/X86/bitreverse.ll @@ -523,3 +523,621 @@ define <2 x i16> @undef_v2i16() { %b = 
call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef) ret <2 x i16> %b } + +; Make sure we don't assert during type legalization promoting a large +; bitreverse due to the need for a large shift that won't fit in the i8 returned +; from getShiftAmountTy. +define i528 @large_promotion(i528 %A) nounwind { +; X86-LABEL: large_promotion: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: bswapl %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: andl $252645135, %ebp # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ebp +; X86-NEXT: andl $-252645136, %ebx # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %ebx +; X86-NEXT: orl %ebp, %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: andl $858993459, %ebp # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %ebx # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %ebx +; X86-NEXT: leal (%ebx,%ebp,4), %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: andl $1431633920, %ebp # imm = 0x55550000 +; X86-NEXT: andl $-1431699456, %ebx # imm = 0xAAAA0000 +; X86-NEXT: shrl %ebx +; X86-NEXT: leal (%ebx,%ebp,2), %ebx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: bswapl %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ebx +; X86-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %edi +; X86-NEXT: leal (%edi,%ebx,4), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl $1431655765, %ebx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %edi # imm = 
0xAAAAAAAA +; X86-NEXT: shrl %edi +; X86-NEXT: leal (%edi,%ebx,2), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bswapl %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %edi +; X86-NEXT: andl $-252645136, %esi # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %esi +; X86-NEXT: orl %edi, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %esi # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %esi +; X86-NEXT: leal (%esi,%edi,4), %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %esi # imm = 0xAAAAAAAA +; X86-NEXT: shrl %esi +; X86-NEXT: leal (%esi,%edi,2), %ebx +; X86-NEXT: bswapl %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %esi +; X86-NEXT: andl $-252645136, %edx # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %edx # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %edx +; X86-NEXT: leal (%edx,%esi,4), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: shrl %edx +; X86-NEXT: leal (%edx,%esi,2), %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bswapl %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %edx +; X86-NEXT: andl $-252645136, %ecx # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %ecx # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %ecx +; X86-NEXT: leal (%ecx,%edx,4), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl 
$1431655765, %edx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: shrl %ecx +; X86-NEXT: leal (%ecx,%edx,2), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; 
X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: 
bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, 
%ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; 
X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx +; X86-NEXT: andl $-252645136, %eax # imm = 0xF0F0F0F0 +; X86-NEXT: shrl $4, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: andl $-858993460, %eax # imm = 0xCCCCCCCC +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: shrl %eax +; X86-NEXT: leal (%eax,%ecx,2), %edx +; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %esi +; X86-NEXT: shrdl $16, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %ebx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shrdl $16, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: shrdl $16, %ebp, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shrdl $16, %ebx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shrdl $16, %eax, %ebx +; X86-NEXT: shrdl $16, %edi, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shrdl $16, %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, 60(%eax) +; X86-NEXT: movl %ecx, 56(%eax) +; X86-NEXT: movl %ebx, 52(%eax) +; X86-NEXT: movl %ebp, 48(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 44(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 40(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 36(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 32(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 28(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; 
X86-NEXT: movl %ecx, 24(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 20(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: shrl $16, %edx +; X86-NEXT: movw %dx, 64(%eax) +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: large_promotion: +; X64: # %bb.0: +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r13 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; X64-NEXT: bswapq %rbx +; X64-NEXT: movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: movq %rbx, %r10 +; X64-NEXT: andq %r13, %r10 +; X64-NEXT: shlq $4, %r10 +; X64-NEXT: movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0 +; X64-NEXT: andq %rax, %rbx +; X64-NEXT: shrq $4, %rbx +; X64-NEXT: orq %r10, %rbx +; X64-NEXT: movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333 +; X64-NEXT: movq %rbx, %r10 +; X64-NEXT: andq %r11, %r10 +; X64-NEXT: movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC +; X64-NEXT: andq %r14, %rbx +; X64-NEXT: shrq $2, %rbx +; X64-NEXT: leaq (%rbx,%r10,4), %r10 +; X64-NEXT: movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000 +; X64-NEXT: andq %r10, %rbx +; X64-NEXT: movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000 +; X64-NEXT: andq %r10, %rdi +; X64-NEXT: shrq %rdi +; X64-NEXT: leaq (%rdi,%rbx,2), %rdi +; X64-NEXT: movq %rdi, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: bswapq %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: shlq $4, %rdi +; X64-NEXT: andq %rax, %rbp +; X64-NEXT: shrq $4, %rbp +; X64-NEXT: orq %rdi, %rbp +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: andq %r11, %rdi +; X64-NEXT: andq %r14, %rbp +; X64-NEXT: shrq $2, %rbp +; X64-NEXT: leaq (%rbp,%rdi,4), %rbp +; X64-NEXT: movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555 +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: andq %rbx, %r10 +; X64-NEXT: movabsq $-6148914691236517206, %rdi # imm = 0xAAAAAAAAAAAAAAAA +; X64-NEXT: andq %rdi, %rbp +; X64-NEXT: shrq %rbp +; X64-NEXT: leaq (%rbp,%r10,2), %rbp +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; X64-NEXT: bswapq %rbp +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: andq %r13, %r10 +; X64-NEXT: shlq $4, %r10 +; X64-NEXT: andq %rax, %rbp +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: shrq $4, %rbp +; X64-NEXT: orq %r10, %rbp +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: andq %r11, %r10 +; X64-NEXT: andq %r14, %rbp +; X64-NEXT: shrq $2, %rbp +; X64-NEXT: leaq (%rbp,%r10,4), %rbp +; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: andq %rbx, %r10 +; X64-NEXT: andq %rdi, %rbp +; X64-NEXT: shrq %rbp +; X64-NEXT: leaq (%rbp,%r10,2), %rbp +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: bswapq %r10 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: andq %r13, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: movq %r15, %rdi +; X64-NEXT: andq %r15, %r10 +; X64-NEXT: shrq $4, %r10 +; X64-NEXT: orq %rax, %r10 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: andq %r11, %rax +; X64-NEXT: andq %r14, %r10 +; X64-NEXT: shrq $2, %r10 +; X64-NEXT: leaq (%r10,%rax,4), %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: andq %rbx, %r10 +; X64-NEXT: movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA +; X64-NEXT: andq %r15, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: leaq (%rax,%r10,2), %r10 +; X64-NEXT: bswapq %r9 +; X64-NEXT: movq %r9, 
%rax +; X64-NEXT: andq %r13, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: andq %rdi, %r9 +; X64-NEXT: shrq $4, %r9 +; X64-NEXT: orq %rax, %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: andq %r11, %rax +; X64-NEXT: andq %r14, %r9 +; X64-NEXT: shrq $2, %r9 +; X64-NEXT: leaq (%r9,%rax,4), %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: andq %rbx, %r9 +; X64-NEXT: andq %r15, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: leaq (%rax,%r9,2), %r9 +; X64-NEXT: bswapq %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: andq %r13, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: andq %rdi, %r8 +; X64-NEXT: shrq $4, %r8 +; X64-NEXT: orq %rax, %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: andq %r11, %rax +; X64-NEXT: andq %r14, %r8 +; X64-NEXT: shrq $2, %r8 +; X64-NEXT: leaq (%r8,%rax,4), %rax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: andq %rbx, %r8 +; X64-NEXT: andq %r15, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: leaq (%rax,%r8,2), %r8 +; X64-NEXT: bswapq %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: andq %r13, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: andq %rdi, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: andq %r11, %rax +; X64-NEXT: andq %r14, %rcx +; X64-NEXT: shrq $2, %rcx +; X64-NEXT: leaq (%rcx,%rax,4), %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: andq %rbx, %rcx +; X64-NEXT: andq %r15, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: leaq (%rax,%rcx,2), %rcx +; X64-NEXT: bswapq %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq %r13, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: andq %rdi, %rdx +; X64-NEXT: shrq $4, %rdx +; X64-NEXT: orq %rax, %rdx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: andq %r11, %rax +; X64-NEXT: andq %r14, %rdx +; X64-NEXT: shrq $2, %rdx +; X64-NEXT: leaq (%rdx,%rax,4), %rax +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: andq %rbx, %rdx +; X64-NEXT: andq %r15, %rax +; X64-NEXT: shrq %rax +; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: bswapq %rsi +; X64-NEXT: andq %rsi, %r13 +; X64-NEXT: andq %rdi, %rsi +; X64-NEXT: 
shlq $4, %r13 +; X64-NEXT: shrq $4, %rsi +; X64-NEXT: orq %r13, %rsi +; X64-NEXT: andq %rsi, %r11 +; X64-NEXT: andq %r14, %rsi +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: leaq (%rsi,%r11,4), %rdx +; X64-NEXT: andq %rdx, %rbx +; X64-NEXT: andq %r15, %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rbx,2), %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: shrdq $48, %rdi, %rsi +; X64-NEXT: shrdq $48, %rbp, %rdi +; X64-NEXT: shrdq $48, %r10, %rbp +; X64-NEXT: shrdq $48, %r9, %r10 +; X64-NEXT: shrdq $48, %r8, %r9 +; X64-NEXT: shrdq $48, %rcx, %r8 +; X64-NEXT: shrdq $48, %rax, %rcx +; X64-NEXT: shrdq $48, %rdx, %rax +; X64-NEXT: movq %rax, 56(%r12) +; X64-NEXT: movq %rcx, 48(%r12) +; X64-NEXT: movq %r8, 40(%r12) +; X64-NEXT: movq %r9, 32(%r12) +; X64-NEXT: movq %r10, 24(%r12) +; X64-NEXT: movq %rbp, 16(%r12) +; X64-NEXT: movq %rdi, 8(%r12) +; X64-NEXT: movq %rsi, (%r12) +; X64-NEXT: shrq $48, %rdx +; X64-NEXT: movw %dx, 64(%r12) +; X64-NEXT: movq %r12, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq + %Z = call i528 @llvm.bitreverse.i528(i528 %A) + ret i528 %Z +} +declare i528 @llvm.bitreverse.i528(i528) diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll index 756dd7fa6f6..4753fc27cc0 100644 --- a/test/CodeGen/X86/bswap.ll +++ b/test/CodeGen/X86/bswap.ll @@ -206,3 +206,153 @@ define i64 @finally_useful_bswap() { ret i64 %swapped } +; Make sure we don't assert during type legalization promoting a large +; bswap due to the need for a large shift that won't fit in the i8 returned +; from getShiftAmountTy. 
+define i528 @large_promotion(i528 %A) nounwind { +; CHECK-LABEL: large_promotion: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: subl $44, %esp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrdl $16, %ecx, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: bswapl %edx +; CHECK-NEXT: shrdl $16, %edx, %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: bswapl %esi +; CHECK-NEXT: shrdl $16, %esi, %edx +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: bswapl %edi +; CHECK-NEXT: shrdl $16, %edi, %esi +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: bswapl %ebx +; CHECK-NEXT: shrdl $16, %ebx, %edi +; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: bswapl %ebp +; CHECK-NEXT: shrdl $16, %ebp, %ebx +; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrdl $16, %ecx, %ebp +; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrdl $16, %eax, %ecx +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrdl $16, %ecx, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrdl $16, %eax, %ecx +; CHECK-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: bswapl %ebp +; CHECK-NEXT: shrdl $16, %ebp, %eax +; CHECK-NEXT: movl %eax, (%esp) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: bswapl %ebx +; CHECK-NEXT: shrdl $16, %ebx, %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: bswapl %esi +; CHECK-NEXT: shrdl $16, %esi, %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: bswapl %edx +; CHECK-NEXT: shrdl $16, %edx, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: bswapl %ecx +; CHECK-NEXT: shrdl $16, %ecx, %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: bswapl %edi +; CHECK-NEXT: shrdl $16, %edi, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ecx, 60(%eax) +; CHECK-NEXT: movl %edx, 56(%eax) +; CHECK-NEXT: movl %esi, 52(%eax) +; CHECK-NEXT: movl %ebx, 48(%eax) +; CHECK-NEXT: movl %ebp, 44(%eax) +; CHECK-NEXT: movl (%esp), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 40(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 36(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 32(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 28(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 24(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 20(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 16(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 12(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 8(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-NEXT: movl %ecx, 4(%eax) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx 
# 4-byte Reload +; CHECK-NEXT: movl %ecx, (%eax) +; CHECK-NEXT: shrl $16, %edi +; CHECK-NEXT: movw %di, 64(%eax) +; CHECK-NEXT: addl $44, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl $4 +; +; CHECK64-LABEL: large_promotion: +; CHECK64: # %bb.0: +; CHECK64-NEXT: pushq %rbx +; CHECK64-NEXT: movq %rdi, %rax +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK64-NEXT: bswapq %r10 +; CHECK64-NEXT: bswapq %rdi +; CHECK64-NEXT: shrdq $48, %rdi, %r10 +; CHECK64-NEXT: bswapq %r11 +; CHECK64-NEXT: shrdq $48, %r11, %rdi +; CHECK64-NEXT: bswapq %rbx +; CHECK64-NEXT: shrdq $48, %rbx, %r11 +; CHECK64-NEXT: bswapq %r9 +; CHECK64-NEXT: shrdq $48, %r9, %rbx +; CHECK64-NEXT: bswapq %r8 +; CHECK64-NEXT: shrdq $48, %r8, %r9 +; CHECK64-NEXT: bswapq %rcx +; CHECK64-NEXT: shrdq $48, %rcx, %r8 +; CHECK64-NEXT: bswapq %rdx +; CHECK64-NEXT: shrdq $48, %rdx, %rcx +; CHECK64-NEXT: bswapq %rsi +; CHECK64-NEXT: shrdq $48, %rsi, %rdx +; CHECK64-NEXT: shrq $48, %rsi +; CHECK64-NEXT: movq %rdx, 56(%rax) +; CHECK64-NEXT: movq %rcx, 48(%rax) +; CHECK64-NEXT: movq %r8, 40(%rax) +; CHECK64-NEXT: movq %r9, 32(%rax) +; CHECK64-NEXT: movq %rbx, 24(%rax) +; CHECK64-NEXT: movq %r11, 16(%rax) +; CHECK64-NEXT: movq %rdi, 8(%rax) +; CHECK64-NEXT: movq %r10, (%rax) +; CHECK64-NEXT: movw %si, 64(%rax) +; CHECK64-NEXT: popq %rbx +; CHECK64-NEXT: retq + %Z = call i528 @llvm.bswap.i528(i528 %A) + ret i528 %Z +} +declare i528 @llvm.bswap.i528(i528) -- GitLab From 80d4302554550430bb1a9ea4331fe49a97df4f57 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 18:40:48 +0000 Subject: [PATCH 0157/1116] Pull out repeated variables from SelectionDAGLegalize::ExpandBitCount. 
The CTPOP case has been changed from VT.getSizeInBits to VT.getScalarSizeInBits - but this fits in with future work for vector support (PR32655) and doesn't affect any current (scalar) uses. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344461 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 3564a767a09..bb2c76a6a41 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2709,13 +2709,12 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) { SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl) { EVT VT = Op.getValueType(); + EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); + unsigned Len = VT.getScalarSizeInBits(); switch (Opc) { default: llvm_unreachable("Cannot expand this yet!"); case ISD::CTPOP: { - EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - unsigned Len = VT.getSizeInBits(); - assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 && "CTPOP not implemented for this type."); @@ -2761,8 +2760,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // This trivially expands to CTLZ. 
return DAG.getNode(ISD::CTLZ, dl, VT, Op); case ISD::CTLZ: { - unsigned Len = VT.getScalarSizeInBits(); - if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op); @@ -2781,7 +2778,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // return popcount(~x); // // Ref: "Hacker's Delight" by Henry Warren - EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) { SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT); Op = DAG.getNode(ISD::OR, dl, VT, Op, @@ -2794,8 +2790,6 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, // This trivially expands to CTTZ. return DAG.getNode(ISD::CTTZ, dl, VT, Op); case ISD::CTTZ: { - unsigned Len = VT.getScalarSizeInBits(); - if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(VT); SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op); -- GitLab From 128986073212af3e3bf947d29247cdbf04d7e8e8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 21:32:49 +0000 Subject: [PATCH 0158/1116] [ARM] Regenerate popcnt tests Improve codegen view as part of PR32655 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344465 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/ARM/popcnt.ll | 311 ++++++++++++++++++++++++++++++------- 1 file changed, 257 insertions(+), 54 deletions(-) diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll index fd61811f49c..224d5dcb3a6 100644 --- a/test/CodeGen/ARM/popcnt.ll +++ b/test/CodeGen/ARM/popcnt.ll @@ -1,17 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s ; Implement ctpop with vcnt define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind { -;CHECK-LABEL: vcnt8: -;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-LABEL: vcnt8: +; CHECK: 
@ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp2 } define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind { -;CHECK-LABEL: vcntQ8: -;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}} +; CHECK-LABEL: vcntQ8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp2 @@ -19,11 +29,16 @@ define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind { define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind { ; CHECK-LABEL: vcnt16: -; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vrev16.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d16, d17 +; CHECK-NEXT: vorr d17, d16, d16 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1) ret <4 x i16> %tmp2 @@ -31,11 +46,17 @@ define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind { define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind { ; CHECK-LABEL: vcntQ16: -; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vrev16.8 q9, q8 +; CHECK-NEXT: vadd.i8 q8, q8, q9 +; 
CHECK-NEXT: vorr q9, q8, q8 +; CHECK-NEXT: vuzp.8 q8, q9 +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1) ret <8 x i16> %tmp2 @@ -43,14 +64,21 @@ define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind { define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind { ; CHECK-LABEL: vcnt32: -; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}} -; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}} -; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vrev16.8 d17, d16 +; CHECK-NEXT: vadd.i8 d16, d16, d17 +; CHECK-NEXT: vorr d17, d16, d16 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vrev32.16 d18, d16 +; CHECK-NEXT: vadd.i16 d16, d16, d18 +; CHECK-NEXT: vorr d17, d16, d16 +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp2 @@ -58,14 +86,22 @@ define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind { define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind { ; CHECK-LABEL: vcntQ32: -; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}} -; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}} -; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}} +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vcnt.8 q8, q8 +; 
CHECK-NEXT: vrev16.8 q9, q8 +; CHECK-NEXT: vadd.i8 q8, q8, q9 +; CHECK-NEXT: vorr q9, q8, q8 +; CHECK-NEXT: vuzp.8 q8, q9 +; CHECK-NEXT: vmovl.u8 q9, d16 +; CHECK-NEXT: vrev32.16 q9, q9 +; CHECK-NEXT: vaddw.u8 q8, q9, d16 +; CHECK-NEXT: vorr q9, q8, q8 +; CHECK-NEXT: vuzp.16 q8, q9 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1) ret <4 x i32> %tmp2 @@ -73,6 +109,51 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind { define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind { ; CHECK-LABEL: vcnt64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: ldr r2, .LCPI6_0 +; CHECK-NEXT: vmov.32 r0, d16[0] +; CHECK-NEXT: ldr r3, .LCPI6_3 +; CHECK-NEXT: vmov.32 r1, d16[1] +; CHECK-NEXT: ldr lr, .LCPI6_2 +; CHECK-NEXT: ldr r12, .LCPI6_1 +; CHECK-NEXT: vldr s1, .LCPI6_4 +; CHECK-NEXT: and r4, r2, r0, lsr #1 +; CHECK-NEXT: sub r0, r0, r4 +; CHECK-NEXT: and r2, r2, r1, lsr #1 +; CHECK-NEXT: sub r1, r1, r2 +; CHECK-NEXT: and r4, r0, r3 +; CHECK-NEXT: and r0, r3, r0, lsr #2 +; CHECK-NEXT: and r2, r1, r3 +; CHECK-NEXT: add r0, r4, r0 +; CHECK-NEXT: and r1, r3, r1, lsr #2 +; CHECK-NEXT: add r1, r2, r1 +; CHECK-NEXT: add r0, r0, r0, lsr #4 +; CHECK-NEXT: and r0, r0, lr +; CHECK-NEXT: add r1, r1, r1, lsr #4 +; CHECK-NEXT: mul r2, r0, r12 +; CHECK-NEXT: and r0, r1, lr +; CHECK-NEXT: mul r1, r0, r12 +; CHECK-NEXT: lsr r0, r2, #24 +; CHECK-NEXT: add r0, r0, r1, lsr #24 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 1431655765 @ 0x55555555 +; CHECK-NEXT: .LCPI6_1: +; CHECK-NEXT: .long 16843009 @ 0x1010101 +; CHECK-NEXT: .LCPI6_2: +; CHECK-NEXT: .long 252645135 @ 0xf0f0f0f +; CHECK-NEXT: .LCPI6_3: 
+; CHECK-NEXT: .long 858993459 @ 0x33333333 +; CHECK-NEXT: .LCPI6_4: +; CHECK-NEXT: .long 0 @ float 0 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1) ret <1 x i64> %tmp2 @@ -80,6 +161,74 @@ define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind { define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind { ; CHECK-LABEL: vcntQ64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.32 r1, d17[1] +; CHECK-NEXT: ldr lr, .LCPI7_0 +; CHECK-NEXT: vmov.32 r2, d17[0] +; CHECK-NEXT: ldr r0, .LCPI7_2 +; CHECK-NEXT: vmov.32 r3, d16[0] +; CHECK-NEXT: ldr r12, .LCPI7_1 +; CHECK-NEXT: ldr r5, .LCPI7_3 +; CHECK-NEXT: vldr s3, .LCPI7_4 +; CHECK-NEXT: and r4, lr, r1, lsr #1 +; CHECK-NEXT: sub r1, r1, r4 +; CHECK-NEXT: and r4, r1, r0 +; CHECK-NEXT: and r1, r0, r1, lsr #2 +; CHECK-NEXT: add r1, r4, r1 +; CHECK-NEXT: and r4, lr, r2, lsr #1 +; CHECK-NEXT: sub r2, r2, r4 +; CHECK-NEXT: and r4, r2, r0 +; CHECK-NEXT: add r1, r1, r1, lsr #4 +; CHECK-NEXT: and r2, r0, r2, lsr #2 +; CHECK-NEXT: and r6, r1, r12 +; CHECK-NEXT: add r2, r4, r2 +; CHECK-NEXT: and r4, lr, r3, lsr #1 +; CHECK-NEXT: sub r3, r3, r4 +; CHECK-NEXT: and r4, r3, r0 +; CHECK-NEXT: add r2, r2, r2, lsr #4 +; CHECK-NEXT: and r3, r0, r3, lsr #2 +; CHECK-NEXT: and r2, r2, r12 +; CHECK-NEXT: add r3, r4, r3 +; CHECK-NEXT: add r3, r3, r3, lsr #4 +; CHECK-NEXT: and r3, r3, r12 +; CHECK-NEXT: mul r4, r3, r5 +; CHECK-NEXT: vmov.32 r3, d16[1] +; CHECK-NEXT: and r1, lr, r3, lsr #1 +; CHECK-NEXT: sub r1, r3, r1 +; CHECK-NEXT: and r3, r1, r0 +; CHECK-NEXT: and r0, r0, r1, lsr #2 +; CHECK-NEXT: mul r1, r2, r5 +; CHECK-NEXT: add r0, r3, r0 +; CHECK-NEXT: mul r2, r6, r5 +; CHECK-NEXT: add r0, r0, r0, lsr #4 +; CHECK-NEXT: and r0, r0, r12 +; CHECK-NEXT: mul r3, r0, r5 +; CHECK-NEXT: lsr r0, r1, #24 +; CHECK-NEXT: lsr r1, r4, #24 +; CHECK-NEXT: add r0, r0, r2, lsr #24 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: 
add r0, r1, r3, lsr #24 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: pop {r4, r5, r6, lr} +; CHECK-NEXT: mov pc, lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI7_0: +; CHECK-NEXT: .long 1431655765 @ 0x55555555 +; CHECK-NEXT: .LCPI7_1: +; CHECK-NEXT: .long 252645135 @ 0xf0f0f0f +; CHECK-NEXT: .LCPI7_2: +; CHECK-NEXT: .long 858993459 @ 0x33333333 +; CHECK-NEXT: .LCPI7_3: +; CHECK-NEXT: .long 16843009 @ 0x1010101 +; CHECK-NEXT: .LCPI7_4: +; CHECK-NEXT: .long 0 @ float 0 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1) ret <2 x i64> %tmp2 @@ -95,48 +244,75 @@ declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone define <8 x i8> @vclz8(<8 x i8>* %A) nounwind { -;CHECK-LABEL: vclz8: -;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-LABEL: vclz8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vclz.i8 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0) ret <8 x i8> %tmp2 } define <4 x i16> @vclz16(<4 x i16>* %A) nounwind { -;CHECK-LABEL: vclz16: -;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-LABEL: vclz16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vclz.i16 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0) ret <4 x i16> %tmp2 } define <2 x i32> @vclz32(<2 x i32>* %A) nounwind { -;CHECK-LABEL: vclz32: -;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}} +; CHECK-LABEL: vclz32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = call <2 x i32> 
@llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0) ret <2 x i32> %tmp2 } define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind { -;CHECK-LABEL: vclzQ8: -;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}} +; CHECK-LABEL: vclzQ8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vclz.i8 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0) ret <16 x i8> %tmp2 } define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind { -;CHECK-LABEL: vclzQ16: -;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}} +; CHECK-LABEL: vclzQ16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vclz.i16 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0) ret <8 x i16> %tmp2 } define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind { -;CHECK-LABEL: vclzQ32: -;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}} +; CHECK-LABEL: vclzQ32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vclz.i32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0) ret <4 x i32> %tmp2 @@ -151,48 +327,75 @@ declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone define <8 x i8> @vclss8(<8 x i8>* %A) nounwind { -;CHECK-LABEL: vclss8: -;CHECK: vcls.s8 +; CHECK-LABEL: vclss8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vcls.s8 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1) ret <8 x i8> %tmp2 } define <4 x i16> @vclss16(<4 x i16>* %A) nounwind { -;CHECK-LABEL: 
vclss16: -;CHECK: vcls.s16 +; CHECK-LABEL: vclss16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vcls.s16 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1) ret <4 x i16> %tmp2 } define <2 x i32> @vclss32(<2 x i32>* %A) nounwind { -;CHECK-LABEL: vclss32: -;CHECK: vcls.s32 +; CHECK-LABEL: vclss32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vcls.s32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1) ret <2 x i32> %tmp2 } define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind { -;CHECK-LABEL: vclsQs8: -;CHECK: vcls.s8 +; CHECK-LABEL: vclsQs8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vcls.s8 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1) ret <16 x i8> %tmp2 } define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind { -;CHECK-LABEL: vclsQs16: -;CHECK: vcls.s16 +; CHECK-LABEL: vclsQs16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vcls.s16 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1) ret <8 x i16> %tmp2 } define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind { -;CHECK-LABEL: vclsQs32: -;CHECK: vcls.s32 +; CHECK-LABEL: vclsQs32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vcls.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1) ret <4 x i32> %tmp2 -- GitLab From 
cc018b73f8c92f38cf715249d13015b141ebe458 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sat, 13 Oct 2018 21:50:15 +0000 Subject: [PATCH 0159/1116] [AARCH64] Regenerate popcnt tests Improve codegen view as part of PR32655 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344466 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AArch64/arm64-vpopcnt.ll | 157 +++++++++++++++++++++++--- 1 file changed, 141 insertions(+), 16 deletions(-) diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll index 4fb73ca4805..0c223ced9ac 100644 --- a/test/CodeGen/AArch64/arm64-vpopcnt.ll +++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll @@ -1,65 +1,190 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-apple- -mcpu=cyclone | FileCheck %s ; The non-byte ones used to fail with "Cannot select" -; CHECK-LABEL: ctpopv8i8 -; CHECK: cnt.8b define <8 x i8> @ctpopv8i8(<8 x i8> %x) nounwind readnone { +; CHECK-LABEL: ctpopv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: ret %cnt = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %x) ret <8 x i8> %cnt } declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone -; CHECK-LABEL: ctpopv4i16 -; CHECK: cnt.8b define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone { +; CHECK-LABEL: ctpopv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[1], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[2], w8 +; 
CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x) ret <4 x i16> %cnt } declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone -; CHECK-LABEL: ctpopv2i32 -; CHECK: cnt.8b define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone { +; CHECK-LABEL: ctpopv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x) ret <2 x i32> %cnt } declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone - -; CHECK-LABEL: ctpopv16i8 -; CHECK: cnt.16b define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone { +; CHECK-LABEL: ctpopv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: ret %cnt = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %x) ret <16 x i8> %cnt } declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone -; CHECK-LABEL: ctpopv8i16 -; CHECK: cnt.8b define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone { +; CHECK-LABEL: ctpopv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: umov w9, v0.h[0] +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: mov v1.h[1], w8 +; CHECK-NEXT: umov w8, v0.h[2] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, 
v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: umov w8, v0.h[4] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[4], w8 +; CHECK-NEXT: umov w8, v0.h[5] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[5], w8 +; CHECK-NEXT: umov w8, v0.h[6] +; CHECK-NEXT: fmov d2, x8 +; CHECK-NEXT: cnt v2.8b, v2.8b +; CHECK-NEXT: uaddlv h2, v2.8b +; CHECK-NEXT: fmov w8, s2 +; CHECK-NEXT: mov v1.h[6], w8 +; CHECK-NEXT: umov w8, v0.h[7] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov v1.h[7], w8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x) ret <8 x i16> %cnt } declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone -; CHECK-LABEL: ctpopv4i32 -; CHECK: cnt.8b define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone { +; CHECK-LABEL: ctpopv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov w10, v0.s[3] +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: fmov d1, x9 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov v0.s[2], w8 +; CHECK-NEXT: fmov d1, x10 +; CHECK-NEXT: cnt v1.8b, v1.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: 
mov v0.s[3], w8 +; CHECK-NEXT: ret %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x) ret <4 x i32> %cnt } declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone -; CHECK-LABEL: ctpopv2i64 -; CHECK: cnt.8b define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone { +; CHECK-LABEL: ctpopv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cnt v1.8b, v0.8b +; CHECK-NEXT: uaddlv h1, v1.8b +; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: fmov d1, x0 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov v1.d[1], x8 +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x) ret <2 x i64> %cnt } -- GitLab From 54d4881c352796b18bfe7314662a294754e3a752 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sat, 13 Oct 2018 21:53:40 +0000 Subject: [PATCH 0160/1116] [ORC] During lookup, do not match against hidden symbols in other JITDylibs. This adds two arguments to the main ExecutionSession::lookup method: MatchNonExportedInJD, and MatchNonExported. These control whether and where hidden symbols should be matched when searching a list of JITDylibs. A similar effect could have been achieved by filtering search results, but this would have involved materializing symbol definitions (since materialization is triggered on lookup) only to throw the results away, among other issues. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344467 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/Core.h | 56 ++++++----- lib/ExecutionEngine/Orc/Core.cpp | 94 +++++++++++-------- lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 5 +- lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 3 +- lib/ExecutionEngine/Orc/LLJIT.cpp | 2 +- lib/ExecutionEngine/Orc/LazyReexports.cpp | 4 +- .../Orc/RTDyldObjectLinkingLayer.cpp | 2 +- .../ExecutionEngine/Orc/CoreAPIsTest.cpp | 60 +++++++----- 8 files changed, 129 insertions(+), 97 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h index f3ea2aef620..24cdeeae42e 100644 --- a/include/llvm/ExecutionEngine/Orc/Core.h +++ b/include/llvm/ExecutionEngine/Orc/Core.h @@ -628,10 +628,12 @@ private: const SymbolNameSet &Names); void lodgeQuery(std::shared_ptr &Q, - SymbolNameSet &Unresolved, MaterializationUnitList &MUs); + SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD, + bool MatchNonExported, MaterializationUnitList &MUs); void lodgeQueryImpl(std::shared_ptr &Q, - SymbolNameSet &Unresolved, MaterializationUnitList &MUs); + SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD, + bool MatchNonExported, MaterializationUnitList &MUs); LookupImplActionFlags lookupImpl(std::shared_ptr &Q, @@ -766,9 +768,19 @@ public: /// dependenant symbols for this query (e.g. it is being made by a top level /// client to get an address to call) then the value NoDependenciesToRegister /// can be used. + /// + /// If the MatchNonExportedInJD pointer is non-null, then the lookup will find + /// non-exported symbols defined in the JITDylib pointed to by + /// MatchNonExportedInJD. + /// If MatchNonExported is true the lookup will find non-exported symbols in + /// any JITDylib (setting MatchNonExportedInJD is redundant in such cases). + /// If MatchNonExported is false and MatchNonExportedInJD is null, + /// non-exported symbols will never be found. 
void lookup(const JITDylibList &JDs, SymbolNameSet Symbols, SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady, - RegisterDependenciesFunction RegisterDependencies); + RegisterDependenciesFunction RegisterDependencies, + JITDylib *MatchNonExportedInJD = nullptr, + bool MatchNonExported = false); /// Blocking version of lookup above. Returns the resolved symbol map. /// If WaitUntilReady is true (the default), will not return until all @@ -779,18 +791,22 @@ public: /// error will be reported via reportErrors. Expected lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols, - RegisterDependenciesFunction RegisterDependencies, - bool WaitUntilReady = true); - - /// Convenience version of the blocking version of lookup above. Uses the main - /// JITDylib's search order as the lookup order, and registers no - /// dependencies. - Expected lookup(const SymbolNameSet &Symbols) { - return getMainJITDylib().withSearchOrderDo( - [&](const JITDylibList &SearchOrder) { - return lookup(SearchOrder, Symbols, NoDependenciesToRegister, true); - }); - } + RegisterDependenciesFunction RegisterDependencies = + NoDependenciesToRegister, + bool WaitUntilReady = true, + JITDylib *MatchNonExportedInJD = nullptr, + bool MatchNonExported = false); + + /// Convenience version of blocking lookup. + /// Performs a single-symbol lookup. + Expected lookup(const JITDylibList &JDs, + SymbolStringPtr Symbol, + bool MatchNonExported = false); + + /// Convenience version of blocking lookup. + /// Performs a single-symbol lookup, auto-interning the given symbol name. + Expected lookup(const JITDylibList &JDs, StringRef Symbol, + bool MatchNonExported = false); /// Materialize the given unit. void dispatchMaterialization(JITDylib &JD, @@ -873,16 +889,6 @@ Error JITDylib::define(std::unique_ptr &MU) { }); } -/// Look up the given names in the given JITDylibs. -/// JDs will be searched in order and no JITDylib pointer may be null. 
-/// All symbols must be found within the given JITDylibs or an error -/// will be returned. -Expected lookup(const JITDylibList &JDs, SymbolNameSet Names); - -/// Look up a symbol by searching a list of JITDylibs. -Expected lookup(const JITDylibList &JDs, - SymbolStringPtr Name); - /// Mangles symbol names then uniques them in the context of an /// ExecutionSession. class MangleAndInterner { diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp index 86a7ecaaf07..c9cfacef61b 100644 --- a/lib/ExecutionEngine/Orc/Core.cpp +++ b/lib/ExecutionEngine/Orc/Core.cpp @@ -646,7 +646,7 @@ void ReExportsMaterializationUnit::materialize( auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); }; ES.lookup({&SrcJD}, QuerySymbols, std::move(OnResolve), std::move(OnReady), - std::move(RegisterDependencies)); + std::move(RegisterDependencies), nullptr, true); } } @@ -1151,16 +1151,18 @@ SymbolNameSet JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags, void JITDylib::lodgeQuery(std::shared_ptr &Q, SymbolNameSet &Unresolved, + JITDylib *MatchNonExportedInJD, bool MatchNonExported, MaterializationUnitList &MUs) { assert(Q && "Query can not be null"); - lodgeQueryImpl(Q, Unresolved, MUs); + lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs); if (FallbackDefinitionGenerator && !Unresolved.empty()) { auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved); if (!FallbackDefs.empty()) { for (auto &D : FallbackDefs) Unresolved.erase(D); - lodgeQueryImpl(Q, FallbackDefs, MUs); + lodgeQueryImpl(Q, FallbackDefs, MatchNonExportedInJD, MatchNonExported, + MUs); assert(FallbackDefs.empty() && "All fallback defs should have been found by lookupImpl"); } @@ -1169,6 +1171,7 @@ void JITDylib::lodgeQuery(std::shared_ptr &Q, void JITDylib::lodgeQueryImpl( std::shared_ptr &Q, SymbolNameSet &Unresolved, + JITDylib *MatchNonExportedInJD, bool MatchNonExported, std::vector> &MUs) { for (auto I = Unresolved.begin(), E = Unresolved.end(); 
I != E;) { auto TmpI = I++; @@ -1179,8 +1182,15 @@ void JITDylib::lodgeQueryImpl( if (SymI == Symbols.end()) continue; - // If we found Name in JD, remove it frome the Unresolved set and add it - // to the added set. + // If this is a non-exported symbol, then check the values of + // MatchNonExportedInJD and MatchNonExported. Skip if we should not match + // against this symbol. + if (!SymI->second.getFlags().isExported()) + if (!MatchNonExported && MatchNonExportedInJD != this) + continue; + + // If we matched against Name in JD, remove it frome the Unresolved set and + // add it to the added set. Unresolved.erase(TmpI); // If the symbol has an address then resolve it. @@ -1695,18 +1705,20 @@ Expected ExecutionSession::legacyLookup( #endif } -void ExecutionSession::lookup( - const JITDylibList &JDs, SymbolNameSet Symbols, - SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady, - RegisterDependenciesFunction RegisterDependencies) { +void ExecutionSession::lookup(const JITDylibList &JDs, SymbolNameSet Symbols, + SymbolsResolvedCallback OnResolve, + SymbolsReadyCallback OnReady, + RegisterDependenciesFunction RegisterDependencies, + JITDylib *MatchNonExportedInJD, + bool MatchNonExported) { // lookup can be re-entered recursively if running on a single thread. Run any - // outstanding MUs in case this query depends on them, otherwise the main - // thread will starve waiting for a result from an MU that it failed to run. + // outstanding MUs in case this query depends on them, otherwise this lookup + // will starve waiting for a result from an MU that is stuck in the queue. 
runOutstandingMUs(); auto Unresolved = std::move(Symbols); - std::map MUsMap; + std::map CollectedMUsMap; auto Q = std::make_shared( Unresolved, std::move(OnResolve), std::move(OnReady)); bool QueryIsFullyResolved = false; @@ -1716,9 +1728,10 @@ void ExecutionSession::lookup( runSessionLocked([&]() { for (auto *JD : JDs) { assert(JD && "JITDylibList entries must not be null"); - assert(!MUsMap.count(JD) && + assert(!CollectedMUsMap.count(JD) && "JITDylibList should not contain duplicate entries"); - JD->lodgeQuery(Q, Unresolved, MUsMap[JD]); + JD->lodgeQuery(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, + CollectedMUsMap[JD]); } if (Unresolved.empty()) { @@ -1741,7 +1754,7 @@ void ExecutionSession::lookup( Q->detach(); // Replace the MUs. - for (auto &KV : MUsMap) + for (auto &KV : CollectedMUsMap) for (auto &MU : KV.second) KV.first->replace(std::move(MU)); } @@ -1761,7 +1774,7 @@ void ExecutionSession::lookup( { std::lock_guard Lock(OutstandingMUsMutex); - for (auto &KV : MUsMap) + for (auto &KV : CollectedMUsMap) for (auto &MU : KV.second) OutstandingMUs.push_back(std::make_pair(KV.first, std::move(MU))); } @@ -1772,7 +1785,8 @@ void ExecutionSession::lookup( Expected ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols, RegisterDependenciesFunction RegisterDependencies, - bool WaitUntilReady) { + bool WaitUntilReady, JITDylib *MatchNonExportedInJD, + bool MatchNonExported) { #if LLVM_ENABLE_THREADS // In the threaded case we use promises to return the results. std::promise PromisedResult; @@ -1839,7 +1853,8 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols, #endif // Perform the asynchronous lookup. 
- lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies); + lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies, + MatchNonExportedInJD, MatchNonExported); #if LLVM_ENABLE_THREADS auto ResultFuture = PromisedResult.get_future(); @@ -1882,6 +1897,27 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols, #endif } +/// Look up a symbol by searching a list of JDs. +Expected ExecutionSession::lookup(const JITDylibList &JDs, + SymbolStringPtr Name, + bool MatchNonExported) { + SymbolNameSet Names({Name}); + + if (auto ResultMap = lookup(JDs, std::move(Names), NoDependenciesToRegister, + true, nullptr, MatchNonExported)) { + assert(ResultMap->size() == 1 && "Unexpected number of results"); + assert(ResultMap->count(Name) && "Missing result for symbol"); + return std::move(ResultMap->begin()->second); + } else + return ResultMap.takeError(); +} + +Expected ExecutionSession::lookup(const JITDylibList &JDs, + StringRef Name, + bool MatchNonExported) { + return lookup(JDs, intern(Name), MatchNonExported); +} + void ExecutionSession::dump(raw_ostream &OS) { runSessionLocked([this, &OS]() { for (auto &JD : JDs) @@ -1910,28 +1946,6 @@ void ExecutionSession::runOutstandingMUs() { } } -Expected lookup(const JITDylibList &JDs, SymbolNameSet Names) { - - if (JDs.empty()) - return SymbolMap(); - - auto &ES = (*JDs.begin())->getExecutionSession(); - - return ES.lookup(JDs, Names, NoDependenciesToRegister, true); -} - -/// Look up a symbol by searching a list of JDs. 
-Expected lookup(const JITDylibList &JDs, - SymbolStringPtr Name) { - SymbolNameSet Names({Name}); - if (auto ResultMap = lookup(JDs, std::move(Names))) { - assert(ResultMap->size() == 1 && "Unexpected number of results"); - assert(ResultMap->count(Name) && "Missing result for symbol"); - return std::move(ResultMap->begin()->second); - } else - return ResultMap.takeError(); -} - MangleAndInterner::MangleAndInterner(ExecutionSession &ES, const DataLayout &DL) : ES(ES), DL(DL) {} diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 47cb273ee12..6a180106240 100644 --- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -128,7 +128,10 @@ Error CtorDtorRunner2::run() { } } - if (auto CtorDtorMap = lookup({&JD}, std::move(Names))) { + auto &ES = JD.getExecutionSession(); + if (auto CtorDtorMap = + ES.lookup({&JD}, std::move(Names), NoDependenciesToRegister, true, + nullptr, true)) { for (auto &KV : CtorDtorsByPriority) { for (auto &Name : KV.second) { assert(CtorDtorMap->count(Name) && "No entry for Name"); diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index d7fd57b6e53..6bc33c90cbc 100644 --- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -99,9 +99,10 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback( Name = I->second; } - if (auto Sym = lookup({&CallbacksJD}, Name)) + if (auto Sym = ES.lookup({&CallbacksJD}, Name, true)) return Sym->getAddress(); else { + llvm::dbgs() << "Didn't find callback.\n"; // If anything goes wrong materializing Sym then report it to the session // and return the ErrorHandlerAddress; ES.reportError(Sym.takeError()); diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp index 47baa45a8aa..39bb4c48067 100644 --- a/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/lib/ExecutionEngine/Orc/LLJIT.cpp @@ 
-78,7 +78,7 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr Obj) { Expected LLJIT::lookupLinkerMangled(JITDylib &JD, StringRef Name) { - return llvm::orc::lookup({&JD}, ES->intern(Name)); + return ES->lookup({&JD}, ES->intern(Name)); } LLJIT::LLJIT(std::unique_ptr ES, diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp index 0d8049178b5..1cce0c6cd2c 100644 --- a/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -52,8 +52,8 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) { SymbolName = I->second.second; } - auto LookupResult = - ES.lookup({SourceJD}, {SymbolName}, NoDependenciesToRegister); + auto LookupResult = ES.lookup({SourceJD}, {SymbolName}, + NoDependenciesToRegister, true, nullptr, true); if (!LookupResult) { ES.reportError(LookupResult.takeError()); diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp index a2c4a2f2081..e84295ca215 100644 --- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp +++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp @@ -52,7 +52,7 @@ public: MR.getTargetJITDylib().withSearchOrderDo([&](const JITDylibList &JDs) { ES.lookup(JDs, InternedSymbols, OnResolvedWithUnwrap, OnReady, - RegisterDependencies); + RegisterDependencies, &MR.getTargetJITDylib()); }); } diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp index cd742187ffb..c8fa6ef5297 100644 --- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp +++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp @@ -220,6 +220,24 @@ TEST_F(CoreAPIsStandardTest, ChainedJITDylibLookup) { EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query"; } +TEST_F(CoreAPIsStandardTest, LookupWithHiddenSymbols) { + auto BarHiddenFlags = BarSym.getFlags() & ~JITSymbolFlags::Exported; + auto BarHiddenSym = 
JITEvaluatedSymbol(BarSym.getAddress(), BarHiddenFlags); + + cantFail(JD.define(absoluteSymbols({{Foo, FooSym}, {Bar, BarHiddenSym}}))); + + auto &JD2 = ES.createJITDylib("JD2"); + cantFail(JD2.define(absoluteSymbols({{Bar, QuxSym}}))); + + auto Result = cantFail(ES.lookup({&JD, &JD2}, {Foo, Bar})); + + EXPECT_EQ(Result.size(), 2U) << "Unexpected number of results"; + EXPECT_EQ(Result.count(Foo), 1U) << "Missing result for \"Foo\""; + EXPECT_EQ(Result.count(Bar), 1U) << "Missing result for \"Bar\""; + EXPECT_EQ(Result[Bar].getAddress(), QuxSym.getAddress()) + << "Wrong result for \"Bar\""; +} + TEST_F(CoreAPIsStandardTest, LookupFlagsTest) { // Test that lookupFlags works on a predefined symbol, and does not trigger // materialization of a lazy symbol. Make the lazy symbol weak to test that @@ -257,7 +275,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicAliases) { {Qux, {Bar, JITSymbolFlags::Weak}}}))); cantFail(JD.define(absoluteSymbols({{Qux, QuxSym}}))); - auto Result = lookup({&JD}, {Baz, Qux}); + auto Result = ES.lookup({&JD}, {Baz, Qux}); EXPECT_TRUE(!!Result) << "Unexpected lookup failure"; EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\""; EXPECT_EQ(Result->count(Qux), 1U) << "No result for \"qux\""; @@ -272,7 +290,7 @@ TEST_F(CoreAPIsStandardTest, TestChainedAliases) { cantFail(JD.define(symbolAliases( {{Baz, {Bar, BazSym.getFlags()}}, {Bar, {Foo, BarSym.getFlags()}}}))); - auto Result = lookup({&JD}, {Bar, Baz}); + auto Result = ES.lookup({&JD}, {Bar, Baz}); EXPECT_TRUE(!!Result) << "Unexpected lookup failure"; EXPECT_EQ(Result->count(Bar), 1U) << "No result for \"bar\""; EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\""; @@ -291,7 +309,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicReExports) { cantFail(JD2.define(reexports(JD, {{Bar, {Foo, BarSym.getFlags()}}}))); - auto Result = cantFail(lookup({&JD2}, Bar)); + auto Result = cantFail(ES.lookup({&JD2}, Bar)); EXPECT_EQ(Result.getAddress(), FooSym.getAddress()) << "Re-export Bar for symbol 
Foo should match FooSym's address"; } @@ -317,7 +335,7 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) { cantFail(JD2.define(reexports( JD, {{Baz, {Foo, BazSym.getFlags()}}, {Qux, {Bar, QuxSym.getFlags()}}}))); - auto Result = cantFail(lookup({&JD2}, Baz)); + auto Result = cantFail(ES.lookup({&JD2}, Baz)); EXPECT_EQ(Result.getAddress(), FooSym.getAddress()) << "Re-export Baz for symbol Foo should match FooSym's address"; @@ -340,7 +358,7 @@ TEST_F(CoreAPIsStandardTest, TestReexportsFallbackGenerator) { EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results"; EXPECT_EQ(Flags[Foo], FooSym.getFlags()) << "Unexpected flags for Foo"; - auto Result = cantFail(lookup({&JD}, Foo)); + auto Result = cantFail(ES.lookup({&JD}, Foo)); EXPECT_EQ(Result.getAddress(), FooSym.getAddress()) << "Incorrect reexported symbol address"; @@ -650,13 +668,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) { }); cantFail(JD.define(MU)); - cantFail(lookup({&JD}, Foo)); + cantFail(ES.lookup({&JD}, Foo)); // Assert that materialization is complete by now. ExpectNoMoreMaterialization = true; // Look up bar to verify that no further materialization happens. 
- auto BarResult = cantFail(lookup({&JD}, Bar)); + auto BarResult = cantFail(ES.lookup({&JD}, Bar)); EXPECT_EQ(BarResult.getAddress(), BarSym.getAddress()) << "Expected Bar == BarSym"; } @@ -670,7 +688,7 @@ TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) { return SymbolNameSet({Bar}); }); - auto Result = cantFail(lookup({&JD}, {Foo, Bar})); + auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar})); EXPECT_EQ(Result.count(Bar), 1U) << "Expected to find fallback def for 'bar'"; EXPECT_EQ(Result[Bar].getAddress(), BarSym.getAddress()) @@ -679,14 +697,14 @@ TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) { TEST_F(CoreAPIsStandardTest, FailResolution) { auto MU = llvm::make_unique( - SymbolFlagsMap( - {{Foo, JITSymbolFlags::Weak}, {Bar, JITSymbolFlags::Weak}}), + SymbolFlagsMap({{Foo, JITSymbolFlags::Exported | JITSymbolFlags::Weak}, + {Bar, JITSymbolFlags::Exported | JITSymbolFlags::Weak}}), [&](MaterializationResponsibility R) { R.failMaterialization(); }); cantFail(JD.define(MU)); SymbolNameSet Names({Foo, Bar}); - auto Result = lookup({&JD}, Names); + auto Result = ES.lookup({&JD}, Names); EXPECT_FALSE(!!Result) << "Expected failure"; if (!Result) { @@ -718,7 +736,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) { cantFail(JD.define(MU)); - auto FooLookupResult = cantFail(lookup({&JD}, Foo)); + auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo)); EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress()) << "lookup returned an incorrect address"; @@ -739,7 +757,7 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) { cantFail(JD.define(absoluteSymbols({{Foo, FooSym}}))); - auto FooLookupResult = cantFail(lookup({&JD}, Foo)); + auto FooLookupResult = cantFail(ES.lookup({&JD}, Foo)); EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress()) << "lookup returned an incorrect address"; @@ -787,14 +805,14 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) { 
EXPECT_FALSE(FooMaterialized) << "Foo should not be materialized yet"; EXPECT_FALSE(BarMaterialized) << "Bar should not be materialized yet"; - auto FooSymResult = cantFail(lookup({&JD}, Foo)); + auto FooSymResult = cantFail(ES.lookup({&JD}, Foo)); EXPECT_EQ(FooSymResult.getAddress(), FooSym.getAddress()) << "Address mismatch for Foo"; EXPECT_TRUE(FooMaterialized) << "Foo should be materialized now"; EXPECT_FALSE(BarMaterialized) << "Bar still should not be materialized"; - auto BarSymResult = cantFail(lookup({&JD}, Bar)); + auto BarSymResult = cantFail(ES.lookup({&JD}, Bar)); EXPECT_EQ(BarSymResult.getAddress(), BarSym.getAddress()) << "Address mismatch for Bar"; EXPECT_TRUE(BarMaterialized) << "Bar should be materialized now"; @@ -814,7 +832,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) { cantFail(JD.define(MU)); - auto Result = lookup({&JD}, {Foo, Bar}); + auto Result = ES.lookup({&JD}, {Foo, Bar}); EXPECT_TRUE(!!Result) << "Result should be a success value"; EXPECT_EQ(Result->count(Foo), 1U) << "\"Foo\" entry missing"; @@ -865,14 +883,4 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) { FooResponsibility->emit(); } -TEST_F(CoreAPIsStandardTest, TestMainJITDylibAndDefaultLookupOrder) { - cantFail(ES.getMainJITDylib().define(absoluteSymbols({{Foo, FooSym}}))); - auto Results = cantFail(ES.lookup({Foo})); - - EXPECT_EQ(Results.size(), 1U) << "Incorrect number of results"; - EXPECT_EQ(Results.count(Foo), 1U) << "Expected result for 'Foo'"; - EXPECT_EQ(Results[Foo].getAddress(), FooSym.getAddress()) - << "Expected result address to match Foo's address"; -} - } // namespace -- GitLab From 3898e47d1e7f74ec12eee0cc8529b6abc9d27a71 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sat, 13 Oct 2018 22:18:22 +0000 Subject: [PATCH 0161/1116] Move some helpers from the global namespace into anonymous ones. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344468 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Demangle/MicrosoftDemangle.cpp | 11 ++++++----- lib/Target/Mips/MipsCallLowering.cpp | 8 ++++---- lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp | 2 +- lib/Target/X86/X86CondBrFolding.cpp | 2 ++ .../Instrumentation/ControlHeightReduction.cpp | 7 ++++--- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp index 9f60eb22cc4..59fb7c9ae9f 100644 --- a/lib/Demangle/MicrosoftDemangle.cpp +++ b/lib/Demangle/MicrosoftDemangle.cpp @@ -652,7 +652,7 @@ Demangler::demangleLiteralOperatorIdentifier(StringView &MangledName) { return N; } -IntrinsicFunctionKind +static IntrinsicFunctionKind translateIntrinsicFunctionCode(char CH, FunctionIdentifierCodeGroup Group) { // Not all ? identifiers are intrinsics *functions*. This function only maps // operator codes for the special functions, all others are handled elsewhere, @@ -1220,7 +1220,7 @@ static void outputEscapedChar(OutputStream &OS, unsigned C) { outputHex(OS, C); } -unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) { +static unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) { const uint8_t *End = StringBytes + Length - 1; unsigned Count = 0; while (Length > 0 && *End == 0) { @@ -1231,7 +1231,8 @@ unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) { return Count; } -unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) { +static unsigned countEmbeddedNulls(const uint8_t *StringBytes, + unsigned Length) { unsigned Result = 0; for (unsigned I = 0; I < Length; ++I) { if (*StringBytes++ == 0) @@ -1240,8 +1241,8 @@ unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) { return Result; } -unsigned guessCharByteSize(const uint8_t *StringBytes, unsigned NumChars, - unsigned NumBytes) { +static unsigned guessCharByteSize(const uint8_t 
*StringBytes, unsigned NumChars, + unsigned NumBytes) { assert(NumBytes > 0); // If the number of bytes is odd, this is guaranteed to be a char string. diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp index 8babdbf902a..4d070f9f523 100644 --- a/lib/Target/Mips/MipsCallLowering.cpp +++ b/lib/Target/Mips/MipsCallLowering.cpp @@ -298,8 +298,8 @@ static bool isSupportedType(Type *T) { return false; } -CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT, - const ISD::ArgFlagsTy &Flags) { +static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT, + const ISD::ArgFlagsTy &Flags) { // > does not mean loss of information as type RegisterVT can't hold type VT, // it means that type VT is split into multiple registers of type RegisterVT if (VT.getSizeInBits() >= RegisterVT.getSizeInBits()) @@ -312,8 +312,8 @@ CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT, } template -void setLocInfo(SmallVectorImpl &ArgLocs, - const SmallVectorImpl &Arguments) { +static void setLocInfo(SmallVectorImpl &ArgLocs, + const SmallVectorImpl &Arguments) { for (unsigned i = 0; i < ArgLocs.size(); ++i) { const CCValAssign &VA = ArgLocs[i]; CCValAssign::LocInfo LocInfo = determineLocInfo( diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp index 936b801a9a0..98953f09482 100644 --- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp +++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp @@ -59,7 +59,7 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() { // possible search paths should be the same. // Returns nullptr in case it does not find any EH pad in the search, or finds // multiple different EH pads. 
-MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) { +static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) { MachineFunction *MF = MI->getParent()->getParent(); SmallVector WL; SmallPtrSet Visited; diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp index 8b9ef20d916..1d221930c2a 100644 --- a/lib/Target/X86/X86CondBrFolding.cpp +++ b/lib/Target/X86/X86CondBrFolding.cpp @@ -84,6 +84,7 @@ FunctionPass *llvm::createX86CondBrFolding() { return new X86CondBrFoldingPass(); } +namespace { // A class the stores the auxiliary information for each MBB. struct TargetMBBInfo { MachineBasicBlock *TBB; @@ -129,6 +130,7 @@ private: return MBBInfos[MBB->getNumber()].get(); } }; +} // namespace // Find a valid path that we can reuse the CondCode. // The resulted path (if return true) is stored in BranchPath. diff --git a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 2c0721f7366..8f4159d3d19 100644 --- a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -621,9 +621,10 @@ static BranchProbability getCHRBiasThreshold() { // CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >= // CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return // false. 
-template -bool checkBias(K *Key, BranchProbability TrueProb, BranchProbability FalseProb, - S &TrueSet, S &FalseSet, M &BiasMap) { +template +static bool checkBias(K *Key, BranchProbability TrueProb, + BranchProbability FalseProb, S &TrueSet, S &FalseSet, + M &BiasMap) { BranchProbability Threshold = getCHRBiasThreshold(); if (TrueProb >= Threshold) { TrueSet.insert(Key); -- GitLab From 1ccfde68b896d5c3f03c7fff42113c8db425e92d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 14 Oct 2018 03:36:27 +0000 Subject: [PATCH 0162/1116] [X86] Type legalize v2f32 stores by widening to v4f32, casting to v2f64, extracting f64 and storing. Summary: This is similar to what D52528 did for loads. It should match what generic type legalization does in 64-bit mode where it uses a v2i64 cast and an i64 store. Reviewers: RKSimon, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53173 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344470 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 47 ++++++++++++++------ test/CodeGen/X86/2011-10-19-widen_vselect.ll | 16 +++---- test/CodeGen/X86/sse-schedule.ll | 44 +++++++----------- test/CodeGen/X86/vec_fptrunc.ll | 6 +-- test/CodeGen/X86/widen_conv-3.ll | 31 ++++--------- 5 files changed, 67 insertions(+), 77 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1abe642a830..7d8fb392b07 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -902,8 +902,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); // We want to legalize this to an f64 load rather than an i64 load on - // 64-bit targets and two 32-bit loads on a 32-bit target. + // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for + // store. 
setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -19943,18 +19945,36 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SDValue StoredVal = St->getValue(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. - assert(StoredVal.getValueType().isVector() && - StoredVal.getValueType().getVectorElementType() == MVT::i1 && - StoredVal.getValueType().getVectorNumElements() <= 8 && - "Unexpected VT"); - assert(!St->isTruncatingStore() && "Expected non-truncating store"); - assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && - "Expected AVX512F without AVX512DQI"); - - StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, - DAG.getUNDEF(MVT::v8i1), StoredVal, + if (StoredVal.getValueType().isVector() && + StoredVal.getValueType().getVectorElementType() == MVT::i1) { + assert(StoredVal.getValueType().getVectorNumElements() <= 8 && + "Unexpected VT"); + assert(!St->isTruncatingStore() && "Expected non-truncating store"); + assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && + "Expected AVX512F without AVX512DQI"); + + StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), StoredVal, + DAG.getIntPtrConstant(0, dl)); + StoredVal = DAG.getBitcast(MVT::i8, StoredVal); + + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); + } + + if (St->isTruncatingStore()) + return SDValue(); + + assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT"); + + // Widen the vector, cast to a v2x64 type, extract the single 64-bit + // element and store it. 
+ StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal, + DAG.getUNDEF(MVT::v2f32)); + StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal, DAG.getIntPtrConstant(0, dl)); - StoredVal = DAG.getBitcast(MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), @@ -36912,7 +36932,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; + MVT LdVT = (Subtarget.is64Bit() && + (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll index a84a85e2ecd..d09abf5fbb1 100644 --- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -8,8 +8,7 @@ define void @simple_widen(<2 x float> %a, <2 x float> %b) { ; X32-LABEL: simple_widen: ; X32: # %bb.0: # %entry -; X32-NEXT: extractps $1, %xmm1, (%eax) -; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: movlps %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: simple_widen: @@ -28,8 +27,7 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) { ; X32-NEXT: movaps %xmm0, %xmm2 ; X32-NEXT: cmpordps %xmm0, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; X32-NEXT: extractps $1, %xmm1, (%eax) -; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: movlps %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: complex_inreg_work: @@ -50,8 +48,7 @@ define void @zero_test() { ; X32-LABEL: zero_test: ; X32: # %bb.0: # %entry ; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: extractps $1, %xmm0, (%eax) -; X32-NEXT: movss %xmm0, (%eax) +; X32-NEXT: movlps %xmm0, 
(%eax) ; X32-NEXT: retl ; ; X64-LABEL: zero_test: @@ -82,11 +79,8 @@ define void @full_test() { ; X32-NEXT: cmpeqps %xmm2, %xmm1 ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 -; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movlps %xmm4, {{[0-9]+}}(%esp) +; X32-NEXT: movlps %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index 662061d8c88..cd1fdfbc6aa 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -2712,8 +2712,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; GENERIC: # %bb.0: ; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; GENERIC-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; GENERIC-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2723,16 +2722,14 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] ; ATOM-NEXT: addps %xmm1, %xmm2 # sched: [5:5.00] ; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] -; ATOM-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] sched: [1:1.00] -; ATOM-NEXT: movlps %xmm2, (%rdi) # sched: [1:1.00] +; ATOM-NEXT: movhps %xmm2, (%rdi) # sched: [1:1.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_movhps: ; SLM: # %bb.0: ; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] ; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; SLM-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 
sched: [1:1.00] -; SLM-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; @@ -2740,8 +2737,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; SANDY-SSE: # %bb.0: ; SANDY-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; SANDY-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SANDY-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SANDY-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; SANDY-SSE-NEXT: retq # sched: [1:1.00] ; @@ -2749,7 +2745,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; SANDY: # %bb.0: ; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SANDY-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -2757,8 +2753,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; HASWELL-SSE: # %bb.0: ; HASWELL-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; HASWELL-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; HASWELL-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; HASWELL-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; HASWELL-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2766,7 +2761,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; HASWELL: # %bb.0: ; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: 
[3:1.00] -; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; HASWELL-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: retq # sched: [7:1.00] ; @@ -2774,8 +2769,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; BROADWELL-SSE: # %bb.0: ; BROADWELL-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BROADWELL-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; BROADWELL-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; BROADWELL-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; BROADWELL-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; BROADWELL-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; BROADWELL-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2783,7 +2777,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; BROADWELL: # %bb.0: ; BROADWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; BROADWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; BROADWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; BROADWELL-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; @@ -2791,8 +2785,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; SKYLAKE-SSE: # %bb.0: ; SKYLAKE-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKYLAKE-SSE-NEXT: addps %xmm1, %xmm0 # sched: [4:0.50] -; SKYLAKE-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SKYLAKE-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SKYLAKE-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2800,7 +2793,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; SKYLAKE: # %bb.0: ; SKYLAKE-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 
sched: [6:1.00] ; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] -; SKYLAKE-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; SKYLAKE-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [7:1.00] ; @@ -2808,8 +2801,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; SKX-SSE: # %bb.0: ; SKX-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKX-SSE-NEXT: addps %xmm1, %xmm0 # sched: [4:0.50] -; SKX-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SKX-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SKX-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SKX-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2817,7 +2809,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; SKX: # %bb.0: ; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] -; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; SKX-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; @@ -2825,8 +2817,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; BTVER2-SSE: # %bb.0: ; BTVER2-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; BTVER2-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50] -; BTVER2-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [2:1.00] +; BTVER2-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [2:1.00] ; BTVER2-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; @@ -2834,7 +2825,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: 
vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [3:1.00] +; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [2:1.00] ; BTVER2-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; @@ -2842,8 +2833,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; ZNVER1-SSE: # %bb.0: ; ZNVER1-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] ; ZNVER1-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; ZNVER1-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50] -; ZNVER1-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:0.50] ; ZNVER1-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: retq # sched: [1:0.50] ; @@ -2851,7 +2841,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:3.00] +; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll index 79abeb0c59f..bb6be6cd9e8 100644 --- a/test/CodeGen/X86/vec_fptrunc.ll +++ b/test/CodeGen/X86/vec_fptrunc.ll @@ -10,8 +10,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) { ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm0 -; X32-SSE-NEXT: extractps $1, %xmm0, 4(%eax) -; X32-SSE-NEXT: movss %xmm0, (%eax) +; X32-SSE-NEXT: movlpd %xmm0, (%eax) ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: fptrunc_frommem2: @@ -19,8 +18,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) { ; X32-AVX-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX-NEXT: vcvtpd2psx (%ecx), %xmm0 -; X32-AVX-NEXT: vextractps $1, %xmm0, 4(%eax) -; X32-AVX-NEXT: vmovss %xmm0, (%eax) +; X32-AVX-NEXT: vmovlpd %xmm0, (%eax) ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: fptrunc_frommem2: diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll index 1eb76b283c0..038c6cb33b6 100644 --- a/test/CodeGen/X86/widen_conv-3.ll +++ b/test/CodeGen/X86/widen_conv-3.ll @@ -7,28 +7,15 @@ ; sign to float v2i16 to v2f32 define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind { -; X86-SSE2-LABEL: convert_v2i16_to_v2f32: -; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: psllq $48, %xmm0 -; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-SSE2-NEXT: movss %xmm0, (%eax) -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE2-NEXT: movss %xmm0, 4(%eax) -; X86-SSE2-NEXT: retl -; -; X86-SSE42-LABEL: convert_v2i16_to_v2f32: -; X86-SSE42: # %bb.0: # %entry -; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: psllq $48, %xmm0 -; X86-SSE42-NEXT: psrad $16, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-SSE42-NEXT: movss %xmm0, (%eax) -; X86-SSE42-NEXT: retl +; X86-LABEL: convert_v2i16_to_v2f32: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: psllq $48, %xmm0 +; X86-NEXT: psrad $16, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X86-NEXT: cvtdq2ps %xmm0, %xmm0 +; X86-NEXT: movlps %xmm0, (%eax) +; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_to_v2f32: ; X64: # %bb.0: # %entry -- GitLab From 6c09fbd91a88e899354b804cf1f9d0aa78d75cdf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 14 Oct 2018 04:01:40 +0000 Subject: [PATCH 0163/1116] 
[X86] Fix bad indentation. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344471 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 7d8fb392b07..441f26dd4c6 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -19946,7 +19946,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. if (StoredVal.getValueType().isVector() && - StoredVal.getValueType().getVectorElementType() == MVT::i1) { + StoredVal.getValueType().getVectorElementType() == MVT::i1) { assert(StoredVal.getValueType().getVectorNumElements() <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); -- GitLab From a3ff03e8e2bf2c8271d5641cff28d4da0c9bbbbf Mon Sep 17 00:00:00 2001 From: Dorit Nuzman Date: Sun, 14 Oct 2018 07:06:16 +0000 Subject: [PATCH 0164/1116] [IAI,LV] Add support for vectorizing predicated strided accesses using masked interleave-group The vectorizer currently does not attempt to create interleave-groups that contain predicated loads/stores; predicated strided accesses can currently be vectorized only using masked gather/scatter or scalarization. This patch makes predicated loads/stores candidates for forming interleave-groups during the Loop-Vectorizer's analysis, and adds the proper support for masked-interleave-groups to the Loop-Vectorizer's planning and transformation stages. The patch also extends the TTI API to allow querying the cost of masked interleave groups (which each target can control); Targets that support masked vector loads/stores may choose to enable this feature and allow vectorizing predicated strided loads/stores using masked wide loads/stores and shuffles.
Reviewers: Ayal, hsaito, dcaballe, fhahn, javed.absar Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D53011 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344472 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 19 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 6 +- include/llvm/Analysis/VectorUtils.h | 21 +- include/llvm/CodeGen/BasicTTIImpl.h | 38 ++- lib/Analysis/TargetTransformInfo.cpp | 10 +- lib/Analysis/VectorUtils.cpp | 29 ++- .../AArch64/AArch64TargetTransformInfo.cpp | 7 +- .../AArch64/AArch64TargetTransformInfo.h | 2 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 8 +- lib/Target/ARM/ARMTargetTransformInfo.h | 2 +- .../Hexagon/HexagonTargetTransformInfo.cpp | 6 +- .../Hexagon/HexagonTargetTransformInfo.h | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 7 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 3 +- .../SystemZ/SystemZTargetTransformInfo.cpp | 6 +- .../SystemZ/SystemZTargetTransformInfo.h | 2 +- lib/Target/X86/X86TargetTransformInfo.cpp | 23 +- lib/Target/X86/X86TargetTransformInfo.h | 9 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 126 ++++++++-- lib/Transforms/Vectorize/VPRecipeBuilder.h | 3 +- lib/Transforms/Vectorize/VPlan.h | 8 +- .../x86-interleaved-accesses-masked-group.ll | 164 +++++++++++++ .../interleaved-accesses-masked-group.ll | 222 ++++++++++++++++++ .../interleaved-accesses-pred-stores.ll | 1 + 24 files changed, 654 insertions(+), 70 deletions(-) create mode 100644 test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll create mode 100644 test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 18b5a5cf0e5..c2a9d1ec195 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -587,6 +587,10 @@ public: /// Enable matching of interleaved access groups. 
bool enableInterleavedAccessVectorization() const; + /// Enable matching of interleaved access groups that contain predicated + /// accesses and are vectorized using masked vector loads/stores. + bool enableMaskedInterleavedAccessVectorization() const; + /// Indicate that it is potentially unsafe to automatically vectorize /// floating-point operations because the semantics of vector and scalar /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math @@ -821,9 +825,11 @@ public: /// load allows gaps) /// \p Alignment is the alignment of the memory operation /// \p AddressSpace is address space of the pointer. + /// \p IsMasked indicates if the memory access is predicated. int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) const; + unsigned AddressSpace, + bool IsMasked = false) const; /// Calculate the cost of performing a vector reduction. /// @@ -1072,6 +1078,7 @@ public: virtual const MemCmpExpansionOptions *enableMemCmpExpansion( bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; + virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, @@ -1132,7 +1139,8 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) = 0; + unsigned AddressSpace, + bool IsMasked = false) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, @@ -1346,6 +1354,9 @@ public: bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } + bool enableMaskedInterleavedAccessVectorization() override { + return Impl.enableMaskedInterleavedAccessVectorization(); + } bool isFPVectorizationPotentiallyUnsafe() override { return 
Impl.isFPVectorizationPotentiallyUnsafe(); } @@ -1471,9 +1482,9 @@ public: } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) override { + unsigned AddressSpace, bool IsMasked) override { return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index e39fe66c0a4..c64d4d36805 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -313,6 +313,8 @@ public: bool enableInterleavedAccessVectorization() { return false; } + bool enableMaskedInterleavedAccessVectorization() { return false; } + bool isFPVectorizationPotentiallyUnsafe() { return false; } bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -450,8 +452,8 @@ public: unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false) { return 1; } diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 622d932f74f..2ac49f67662 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -125,6 +125,21 @@ computeMinimumValueSizes(ArrayRef Blocks, /// This function always sets a (possibly null) value for each K in Kinds. Instruction *propagateMetadata(Instruction *I, ArrayRef VL); +/// Create a mask with replicated elements. +/// +/// This function creates a shuffle mask for replicating each of the \p VF +/// elements in a vector \p ReplicationFactor times. 
It can be used to +/// transform a mask of \p VF elements into a mask of +/// \p VF * \p ReplicationFactor elements used by a predicated +/// interleaved-group of loads/stores whose Interleaved-factor == +/// \p ReplicationFactor. +/// +/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: +/// +/// <0,0,0,1,1,1,2,2,2,3,3,3> +Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, + unsigned VF); + /// Create an interleave shuffle mask. /// /// This function creates a shuffle mask for interleaving \p NumVecs vectors of @@ -328,7 +343,7 @@ public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, DominatorTree *DT, LoopInfo *LI, const LoopAccessInfo *LAI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} + : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} ~InterleavedAccessInfo() { SmallPtrSet DelSet; @@ -341,7 +356,9 @@ public: /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. - void analyzeInterleaving(); + /// Consider also predicated loads/stores in the analysis if + /// \p EnableMaskedInterleavedGroup is true. + void analyzeInterleaving(bool EnableMaskedInterleavedGroup); /// Check if \p Instr belongs to any interleave group. 
bool isInterleaved(Instruction *Instr) const { diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index b460cdc0ba1..e740fe57172 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -783,8 +783,8 @@ public: unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false) { VectorType *VT = dyn_cast(VecTy); assert(VT && "Expect a vector type for interleaved memory op"); @@ -795,8 +795,13 @@ public: VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); // Firstly, the cost of load/store operation. - unsigned Cost = static_cast(this)->getMemoryOpCost( - Opcode, VecTy, Alignment, AddressSpace); + unsigned Cost; + if (IsMasked) + Cost = static_cast(this)->getMaskedMemoryOpCost( + Opcode, VecTy, Alignment, AddressSpace); + else + Cost = static_cast(this)->getMemoryOpCost(Opcode, VecTy, Alignment, + AddressSpace); // Legalize the vector type, and get the legalized and unlegalized type // sizes. @@ -892,6 +897,31 @@ public: ->getVectorInstrCost(Instruction::InsertElement, VT, i); } + if (!IsMasked) + return Cost; + + Type *I8Type = Type::getInt8Ty(VT->getContext()); + VectorType *MaskVT = VectorType::get(I8Type, NumElts); + SubVT = VectorType::get(I8Type, NumSubElts); + + // The Mask shuffling cost is extract all the elements of the Mask + // and insert each of them Factor times into the wide vector: + // + // E.g. an interleaved group with factor 3: + // %mask = icmp ult <8 x i32> %vec1, %vec2 + // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, + // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> + // The cost is estimated as extract all mask elements from the <8xi1> mask + // vector and insert them factor times into the <24xi1> shuffled mask + // vector. 
+ for (unsigned i = 0; i < NumSubElts; i++) + Cost += static_cast(this)->getVectorInstrCost( + Instruction::ExtractElement, SubVT, i); + + for (unsigned i = 0; i < NumElts; i++) + Cost += static_cast(this)->getVectorInstrCost( + Instruction::InsertElement, MaskVT, i); + return Cost; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 4ad48e351a4..867403d0ef1 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -268,6 +268,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } +bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { + return TTIImpl->enableMaskedInterleavedAccessVectorization(); +} + bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const { return TTIImpl->isFPVectorizationPotentiallyUnsafe(); } @@ -515,9 +519,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) const { - int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + unsigned Alignment, unsigned AddressSpace, bool IsMasked) const { + int Cost = TTIImpl->getInterleavedMemoryOpCost( + Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 272c665ace1..e14449b8838 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -502,6 +502,16 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef VL) { return Inst; } +Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, + unsigned ReplicationFactor, unsigned VF) { + SmallVector MaskVec; + for 
(unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < ReplicationFactor; j++) + MaskVec.push_back(Builder.getInt32(i)); + + return ConstantVector::get(MaskVec); +} + Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs) { SmallVector Mask; @@ -672,7 +682,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving() { +void InterleavedAccessInfo::analyzeInterleaving( + bool EnablePredicatedInterleavedMemAccesses) { LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); const ValueToValueMap &Strides = LAI->getSymbolicStrides(); @@ -712,9 +723,8 @@ void InterleavedAccessInfo::analyzeInterleaving() { // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. InterleaveGroup *Group = nullptr; - // TODO: Ignore B if it is in a predicated block. This restriction can be - // relaxed in the future once we handle masked interleaved groups. - if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) { + if (isStrided(DesB.Stride) && + (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { Group = getInterleaveGroup(B); if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B @@ -808,11 +818,12 @@ void InterleavedAccessInfo::analyzeInterleaving() { if (DistanceToB % static_cast(DesB.Size)) continue; - // Ignore A if either A or B is in a predicated block. Although we - // currently prevent group formation for predicated accesses, we may be - // able to relax this limitation in the future once we handle more - // complicated blocks. 
- if (isPredicated(A->getParent()) || isPredicated(B->getParent())) + // All members of a predicated interleave-group must have the same predicate, + // and currently must reside in the same BB. + BasicBlock *BlockA = A->getParent(); + BasicBlock *BlockB = B->getParent(); + if ((isPredicated(BlockA) || isPredicated(BlockB)) && + (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB)) continue; // The index of A is the index of B plus A's distance to B in multiples diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 96e751e8697..a16de89cf10 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -659,11 +659,12 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); - if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { + if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -676,7 +677,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index c056a7d2428..b3893d32850 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -146,7 +146,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef 
Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1b0d162f726..bac3e6c2387 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -542,14 +542,16 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; - if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && + !IsMasked) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -562,7 +564,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 7d14bd7c256..84e3055c6bc 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -169,7 +169,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); diff --git
a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 4d0e7dc52e8..79b269bccfe 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) { - if (Indices.size() != Factor) + unsigned Alignment, unsigned AddressSpace, bool IsMasked) { + if (Indices.size() != Factor || IsMasked) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 2c03cd268ff..901a91692e8 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -123,7 +123,7 @@ public: bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index b0da9b5a6d7..2c81661cb17 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -473,7 +473,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool
IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); + assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 2ee2b3eb808..252d46e7a2a 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -90,7 +90,8 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, + bool IsMasked = false); /// @} }; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 6f553d5bed3..1eaeb9699bf 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -909,7 +909,11 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index bfa942357c5..92b2b9bdcb8 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,7 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); /// @} }; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index d3a75123935..82e4dfe25b7 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2723,7 +2723,12 @@ int 
X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2832,7 +2837,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -2950,7 +2960,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -2962,11 +2973,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h 
b/lib/Target/X86/X86TargetTransformInfo.h index 3df89903882..2bd778a4211 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -101,13 +101,16 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getIntImmCost(int64_t); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7ebe8d102b7..e93cfb34156 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,6 +172,10 @@ static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); +static cl::opt EnableMaskedInterleavedMemAccesses( + "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); + /// We don't interleave loops with a known constant trip count below this /// number. static const unsigned TinyTripCountInterleaveThreshold = 128; @@ -408,8 +412,10 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to. 
- void vectorizeInterleaveGroup(Instruction *Instr); + /// Try to vectorize the interleaved access group that \p Instr belongs to, + /// optionally masking the vector operations if \p BlockInMask is non-null. + void vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask = nullptr); /// Vectorize Load and Store instructions, optionally masking the vector /// operations if \p BlockInMask is non-null. @@ -1112,6 +1118,11 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction in an interleaved-group + /// of memory accesses that can be vectorized with wide vector loads/stores + /// and shuffles. + bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); @@ -1946,7 +1957,8 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask) { const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -1968,6 +1980,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { SmallVector NewPtrs; unsigned Index = Group->getIndex(Instr); + VectorParts Mask; + bool IsMaskRequired = BlockInMask; + if (IsMaskRequired) { + Mask = *BlockInMask; + // TODO: extend the masked interleaved-group support to reversed access. 
+ assert(!Group->isReverse() && "Reversed masked interleave-group " + "not supported."); + } + // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the @@ -2011,8 +2032,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // For each unroll part, create a wide load for the group. SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - auto *NewLoad = Builder.CreateAlignedLoad( - NewPtrs[Part], Group->getAlignment(), "wide.vec"); + Instruction *NewLoad; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + ShuffledMask, UndefVec, + "wide.masked.vec"); + } + else + NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], + Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2079,8 +2111,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); - Instruction *NewStoreInstr = - Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + Instruction *NewStoreInstr; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewStoreInstr = Builder.CreateMaskedStore( + IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + } + else + NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], + Group->getAlignment()); 
Group->addMetadata(NewStoreInstr); } @@ -4253,6 +4295,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } +static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { + if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) + return TTI.enableMaskedInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + return EnableMaskedInterleavedMemAccesses; +} + +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, + unsigned VF) { + assert(isAccessInterleaved(I) && "Expecting interleaved access."); + assert(getWideningDecision(I, VF) == CM_Unknown && + "Decision should not be set yet."); + + if (!Legal->blockNeedsPredication(I->getParent()) || + !Legal->isMaskRequired(I)) + return true; + + if (!useMaskedInterleavedAccesses(TTI)) + return false; + + auto *Ty = getMemInstValueType(I); + return isa(I) ? TTI.isLegalMaskedLoad(Ty) + : TTI.isLegalMaskedStore(Ty); +} + bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, unsigned VF) { // Get and ensure we have a valid memory instruction. @@ -5371,13 +5439,17 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, - Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlignment(), AS, Legal->isMaskRequired(I)); + + if (Group->isReverse()) { + // TODO: Add support for reversed masked interleaved access. 
+ assert(!Legal->isMaskRequired(I) && + "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + } return Cost; } @@ -5479,7 +5551,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { continue; NumAccesses = Group->getNumMembers(); - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, VF)) + InterleaveCost = getInterleaveGroupCost(&I, VF); } unsigned GatherScatterCost = @@ -6152,7 +6225,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { } VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { + VFRange &Range, + VPlanPtr &Plan) { const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6174,7 +6248,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, assert(I == IG->getInsertPos() && "Generating a recipe for an adjunct member of an interleave group"); - return new VPInterleaveRecipe(IG); + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(I)) + Mask = createBlockInMask(I->getParent(), Plan); + + return new VPInterleaveRecipe(IG, Mask); } VPWidenMemoryInstructionRecipe * @@ -6442,7 +6520,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPRecipeBase *Recipe = nullptr; // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. 
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { VPBB->appendRecipe(Recipe); return true; } @@ -6669,6 +6747,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); + if (User) { + O << ", "; + User->getOperand(0)->printAsOperand(O); + } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) @@ -6731,7 +6813,15 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + if (!User) + return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + + // Last (and currently only) operand is a mask. + InnerLoopVectorizer::VectorParts MaskValues(State.UF); + VPValue *Mask = User->getOperand(User->getNumOperands() - 1); + for (unsigned Part = 0; Part < State.UF; ++Part) + MaskValues[Part] = State.get(Mask, Part); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7030,7 +7120,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Analyze interleaved memory accesses. if (UseInterleaved) { - IAI.analyzeInterleaving(); + IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); } // Use the cost model. diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h index f43a8bb123b..15d38ac9c84 100644 --- a/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,7 +69,8 @@ public: /// \return value is , as it is handled by another recipe. /// \p Range.End may be decreased to ensure same decision from \p Range.Start /// to \p Range.End. 
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index c3123b41600..81b1986c97d 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -769,10 +769,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup *IG; + std::unique_ptr User; public: - VPInterleaveRecipe(const InterleaveGroup *IG) - : VPRecipeBase(VPInterleaveSC), IG(IG) {} + VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG) { + if (Mask) // Create a VPInstruction to register as a user of the mask. + User.reset(new VPUser({Mask})); + } ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. 
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll new file mode 100644 index 00000000000..b1163d0a199 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -0,0 +1,164 @@ +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +; When masked-interleaved-groups are disabled: +; Check that the predicated load is not vectorized as an +; interleaved-group but rather as a scalarized accesses. +; (For SKX, Gather is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load for an interleave-group (with +; a single member). +; +; void masked_strided1(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char t = p[2*ix]; +; q[ix] = t; +; } +; } +; } + +;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;DISABLED_MASKED_STRIDED: vector.body: +;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. 
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> +;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + 
+for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; Check also a scenario with full interleave-groups (no gaps) as well as both +; load and store groups. We check that when masked-interleave-group is disabled +; the predicated loads (and stores) are not vectorized as an +; interleaved-group but rather as four separate scalarized accesses. +; (For SKX, gather/scatter is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load/store for the two interleave- +; groups. +; +; void masked_strided2(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + +;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2( +;DISABLED_MASKED_STRIDED: vector.body: +;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. 
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> +;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask) + +; Function Attrs: norecurse nounwind +define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = 
icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.024, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx4, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll new file mode 100644 index 00000000000..9ed66a22dbf --- /dev/null +++ b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll @@ -0,0 +1,222 @@ +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + +; We test here that the loop-vectorizer forms an interleave-groups from +; predicated memory accesses only if they are both in the same (predicated) +; block (first scenario below). 
+; If the accesses are not in the same predicated block, an interleave-group +; is not formed (scenarios 2,3 below). + +; Scenario 1: Check the case where it is legal to create masked interleave- +; groups. Altogether two groups are created (one for loads and one for stores) +; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses +; are disabled we do not create any interleave-group. +; +; void masked_strided1(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1 + +; Scenario 2: Check the case where it is illegal to create a masked interleave- +; group because the first access is predicated, and the second isn't. +; We therefore create a separate interleave-group with gaps for each of the +; stores (if masked-interleaved-accesses are enabled) and these are later +; invalidated because interleave-groups of stores with gaps are not supported. 
+; If masked-interleaved-accesses is not enabled we create only one interleave +; group of stores (for the non-predicated store) and it is later invalidated +; due to gaps. +; +; void masked_strided2(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard1, +; unsigned char guard2) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard1) { +; q[2*ix] = 1; +; } +; q[2*ix+1] = 2; +; } +;} + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. + + +; Scenario 3: Check the case where it is illegal to create a masked interleave- +; group because the two accesses are in separate predicated blocks. +; We therefore create a separate interleave-group with gaps for each of the accesses, +; (which are later invalidated because interleave-groups of stores with gaps are +; not supported). +; If masked-interleaved-accesses is not enabled we don't create any interleave +; group because all accesses are predicated. 
+; +; void masked_strided3(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard1, +; unsigned char guard2) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard1) { +; q[2*ix] = 1; +; } +; if (ix > guard2) { +; q[2*ix+1] = 2; +; } +; } +;} + + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. + + +; ModuleID = 'test.c' +source_filename = "test.c" +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.024, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx4, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* 
%q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %mul = shl nuw nsw i32 %ix.012, 1 + %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 1, i8* %arrayidx, align 1 + %cmp1 = icmp ugt i32 %ix.012, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %add = or i32 %mul, 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 2, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.012, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard1 to i32 + %conv3 = zext i8 %guard2 to i32 + br label %for.body + +for.body: + %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %mul = shl nuw nsw i32 %ix.018, 1 + %cmp1 = icmp ugt i32 %ix.018, %conv + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 1, i8* %arrayidx, align 1 + br label %if.end + +if.end: + %cmp4 = icmp ugt i32 %ix.018, %conv3 + br i1 %cmp4, label %if.then6, label %for.inc + +if.then6: + %add = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 2, i8* %arrayidx7, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.018, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label 
%for.body + +for.end: + ret void +} + +attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 89c0ac10916..c647f586b18 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" %pair = type { i64, i64 } -- GitLab From 473da03560118802fa66f089afeda9d0b38b2ab4 Mon Sep 17 00:00:00 2001 From: Dorit Nuzman Date: Sun, 14 Oct 2018 07:21:20 +0000 Subject: [PATCH 0165/1116] revert 344472 due to failures. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344473 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 19 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 6 +- include/llvm/Analysis/VectorUtils.h | 21 +- include/llvm/CodeGen/BasicTTIImpl.h | 38 +-- lib/Analysis/TargetTransformInfo.cpp | 10 +- lib/Analysis/VectorUtils.cpp | 29 +-- .../AArch64/AArch64TargetTransformInfo.cpp | 7 +- .../AArch64/AArch64TargetTransformInfo.h | 2 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 8 +- lib/Target/ARM/ARMTargetTransformInfo.h | 2 +- .../Hexagon/HexagonTargetTransformInfo.cpp | 6 +- .../Hexagon/HexagonTargetTransformInfo.h | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 7 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 3 +- .../SystemZ/SystemZTargetTransformInfo.cpp | 6 +- .../SystemZ/SystemZTargetTransformInfo.h | 2 +- lib/Target/X86/X86TargetTransformInfo.cpp | 23 +- lib/Target/X86/X86TargetTransformInfo.h | 9 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 126 ++-------- lib/Transforms/Vectorize/VPRecipeBuilder.h | 3 +- lib/Transforms/Vectorize/VPlan.h | 8 +- .../x86-interleaved-accesses-masked-group.ll | 164 ------------- .../interleaved-accesses-masked-group.ll | 222 ------------------ .../interleaved-accesses-pred-stores.ll | 1 - 24 files changed, 70 insertions(+), 654 deletions(-) delete mode 100644 test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll delete mode 100644 test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index c2a9d1ec195..18b5a5cf0e5 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -587,10 +587,6 @@ public: /// Enable matching of interleaved access groups. 
bool enableInterleavedAccessVectorization() const; - /// Enable matching of interleaved access groups that contain predicated - /// accesses and are vectorized using masked vector loads/stores. - bool enableMaskedInterleavedAccessVectorization() const; - /// Indicate that it is potentially unsafe to automatically vectorize /// floating-point operations because the semantics of vector and scalar /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math @@ -825,11 +821,9 @@ public: /// load allows gaps) /// \p Alignment is the alignment of the memory operation /// \p AddressSpace is address space of the pointer. - /// \p IsMasked indicates if the memory access is predicated. int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked = false) const; + unsigned AddressSpace) const; /// Calculate the cost of performing a vector reduction. /// @@ -1078,7 +1072,6 @@ public: virtual const MemCmpExpansionOptions *enableMemCmpExpansion( bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; - virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, @@ -1139,8 +1132,7 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked = false) = 0; + unsigned AddressSpace) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, @@ -1354,9 +1346,6 @@ public: bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } - bool enableMaskedInterleavedAccessVectorization() override { - return Impl.enableMaskedInterleavedAccessVectorization(); - } bool isFPVectorizationPotentiallyUnsafe() override { return 
Impl.isFPVectorizationPotentiallyUnsafe(); } @@ -1482,9 +1471,9 @@ public: } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked) override { + unsigned AddressSpace) override { return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace); } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index c64d4d36805..e39fe66c0a4 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -313,8 +313,6 @@ public: bool enableInterleavedAccessVectorization() { return false; } - bool enableMaskedInterleavedAccessVectorization() { return false; } - bool isFPVectorizationPotentiallyUnsafe() { return false; } bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -452,8 +450,8 @@ public: unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false) { + unsigned Alignment, + unsigned AddressSpace) { return 1; } diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 2ac49f67662..622d932f74f 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -125,21 +125,6 @@ computeMinimumValueSizes(ArrayRef Blocks, /// This function always sets a (possibly null) value for each K in Kinds. Instruction *propagateMetadata(Instruction *I, ArrayRef VL); -/// Create a mask with replicated elements. -/// -/// This function creates a shuffle mask for replicating each of the \p VF -/// elements in a vector \p ReplicationFactor times. 
It can be used to -/// transform a mask of \p VF elements into a mask of -/// \p VF * \p ReplicationFactor elements used by a predicated -/// interleaved-group of loads/stores whose Interleaved-factor == -/// \p ReplicationFactor. -/// -/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: -/// -/// <0,0,0,1,1,1,2,2,2,3,3,3> -Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, - unsigned VF); - /// Create an interleave shuffle mask. /// /// This function creates a shuffle mask for interleaving \p NumVecs vectors of @@ -343,7 +328,7 @@ public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, DominatorTree *DT, LoopInfo *LI, const LoopAccessInfo *LAI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} + : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} ~InterleavedAccessInfo() { SmallPtrSet DelSet; @@ -356,9 +341,7 @@ public: /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. - /// Consider also predicated loads/stores in the analysis if - /// \p EnableMaskedInterleavedGroup is true. - void analyzeInterleaving(bool EnableMaskedInterleavedGroup); + void analyzeInterleaving(); /// Check if \p Instr belongs to any interleave group. 
bool isInterleaved(Instruction *Instr) const { diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index e740fe57172..b460cdc0ba1 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -783,8 +783,8 @@ public: unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false) { + unsigned Alignment, + unsigned AddressSpace) { VectorType *VT = dyn_cast(VecTy); assert(VT && "Expect a vector type for interleaved memory op"); @@ -795,13 +795,8 @@ public: VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); // Firstly, the cost of load/store operation. - unsigned Cost; - if (IsMasked) - Cost = static_cast(this)->getMaskedMemoryOpCost( - Opcode, VecTy, Alignment, AddressSpace); - else - Cost = static_cast(this)->getMemoryOpCost(Opcode, VecTy, Alignment, - AddressSpace); + unsigned Cost = static_cast(this)->getMemoryOpCost( + Opcode, VecTy, Alignment, AddressSpace); // Legalize the vector type, and get the legalized and unlegalized type // sizes. @@ -897,31 +892,6 @@ public: ->getVectorInstrCost(Instruction::InsertElement, VT, i); } - if (!IsMasked) - return Cost; - - Type *I8Type = Type::getInt8Ty(VT->getContext()); - VectorType *MaskVT = VectorType::get(I8Type, NumElts); - SubVT = VectorType::get(I8Type, NumSubElts); - - // The Mask shuffling cost is extract all the elements of the Mask - // and insert each of them Factor times into the wide vector: - // - // E.g. an interleaved group with factor 3: - // %mask = icmp ult <8 x i32> %vec1, %vec2 - // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, - // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> - // The cost is estimated as extract all mask elements from the <8xi1> mask - // vector and insert them factor times into the <24xi1> shuffled mask - // vector. 
- for (unsigned i = 0; i < NumSubElts; i++) - Cost += static_cast(this)->getVectorInstrCost( - Instruction::ExtractElement, SubVT, i); - - for (unsigned i = 0; i < NumElts; i++) - Cost += static_cast(this)->getVectorInstrCost( - Instruction::InsertElement, MaskVT, i); - return Cost; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 867403d0ef1..4ad48e351a4 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -268,10 +268,6 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } -bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { - return TTIImpl->enableMaskedInterleavedAccessVectorization(); -} - bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const { return TTIImpl->isFPVectorizationPotentiallyUnsafe(); } @@ -519,9 +515,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool IsMasked) const { - int Cost = TTIImpl->getInterleavedMemoryOpCost( - Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked); + unsigned Alignment, unsigned AddressSpace) const { + int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index e14449b8838..272c665ace1 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -502,16 +502,6 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef VL) { return Inst; } -Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, - unsigned ReplicationFactor, unsigned VF) { - SmallVector MaskVec; - for 
(unsigned i = 0; i < VF; i++) - for (unsigned j = 0; j < ReplicationFactor; j++) - MaskVec.push_back(Builder.getInt32(i)); - - return ConstantVector::get(MaskVec); -} - Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs) { SmallVector Mask; @@ -682,8 +672,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving( - bool EnablePredicatedInterleavedMemAccesses) { +void InterleavedAccessInfo::analyzeInterleaving() { LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); const ValueToValueMap &Strides = LAI->getSymbolicStrides(); @@ -723,8 +712,9 @@ void InterleavedAccessInfo::analyzeInterleaving( // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. InterleaveGroup *Group = nullptr; - if (isStrided(DesB.Stride) && - (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { + // TODO: Ignore B if it is in a predicated block. This restriction can be + // relaxed in the future once we handle masked interleaved groups. + if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) { Group = getInterleaveGroup(B); if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B @@ -818,12 +808,11 @@ void InterleavedAccessInfo::analyzeInterleaving( if (DistanceToB % static_cast(DesB.Size)) continue; - // All members of a predicated interleave-group must have the same predicate, - // and currently must reside in the same BB. 
- BasicBlock *BlockA = A->getParent(); - BasicBlock *BlockB = B->getParent(); - if ((isPredicated(BlockA) || isPredicated(BlockB)) && - (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB)) + // Ignore A if either A or B is in a predicated block. Although we + // currently prevent group formation for predicated accesses, we may be + // able to relax this limitation in the future once we handle more + // complicated blocks. + if (isPredicated(A->getParent()) || isPredicated(B->getParent())) continue; // The index of A is the index of B plus A's distance to B in multiples diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index a16de89cf10..96e751e8697 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -659,12 +659,11 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked) { + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); - if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -677,7 +676,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index b3893d32850..c056a7d2428 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ 
-146,7 +146,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked = false); + unsigned AddressSpace); bool shouldConsiderAddressTypePromotion(const Instruction &I, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index bac3e6c2387..1b0d162f726 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -542,16 +542,14 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace - bool IsMasked) { + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; - if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && - !IsMasked) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -564,7 +562,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 84e3055c6bc..7d14bd7c256 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -169,7 +169,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked); + unsigned AddressSpace); void 
getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 79b269bccfe..4d0e7dc52e8 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool IsMasked) { - if (Indices.size() != Factor || IsMasked) + unsigned Alignment, unsigned AddressSpace) { + if (Indices.size() != Factor) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace); return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 901a91692e8..2c03cd268ff 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -123,7 +123,7 @@ public: bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked); + unsigned AddressSpace); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2c81661cb17..b0da9b5a6d7 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -473,12 +473,7 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, 
ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - Bool IsMasked) { - if (IsMasked) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); - + unsigned AddressSpace) { assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 252d46e7a2a..2ee2b3eb808 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -90,8 +90,7 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked = false); + unsigned AddressSpace); /// @} }; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 1eaeb9699bf..6f553d5bed3 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -909,11 +909,7 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked) { - if (IsMasked) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + unsigned AddressSpace) { assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 92b2b9bdcb8..bfa942357c5 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,7 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool IsMasked = false); + unsigned AddressSpace); /// @} }; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 82e4dfe25b7..d3a75123935 100644 --- 
a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2723,12 +2723,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked) { - - if (IsMasked) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + unsigned AddressSpace) { // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2837,12 +2832,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked) { - - if (IsMasked) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + unsigned AddressSpace) { // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -2960,8 +2950,7 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, - bool IsMasked) { + unsigned AddressSpace) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -2973,11 +2962,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, IsMasked); + Alignment, AddressSpace); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, 
AddressSpace, IsMasked); + Alignment, AddressSpace); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 2bd778a4211..3df89903882 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -101,16 +101,13 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + unsigned Alignment, unsigned AddressSpace); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + unsigned Alignment, unsigned AddressSpace); int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, - bool IsMasked = false); + unsigned Alignment, unsigned AddressSpace); int getIntImmCost(int64_t); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e93cfb34156..7ebe8d102b7 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,10 +172,6 @@ static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); -static cl::opt EnableMaskedInterleavedMemAccesses( - "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, - cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); - /// We don't interleave loops with a known constant trip count below this /// number. static const unsigned TinyTripCountInterleaveThreshold = 128; @@ -412,10 +408,8 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. 
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to, - /// optionally masking the vector operations if \p BlockInMask is non-null. - void vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask = nullptr); + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); /// Vectorize Load and Store instructions, optionally masking the vector /// operations if \p BlockInMask is non-null. @@ -1118,11 +1112,6 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); - /// Returns true if \p I is a memory instruction in an interleaved-group - /// of memory accesses that can be vectorized with wide vector loads/stores - /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); - /// Check if \p Instr belongs to any interleaved access group. 
bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); @@ -1957,8 +1946,7 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask) { +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -1980,15 +1968,6 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, SmallVector NewPtrs; unsigned Index = Group->getIndex(Instr); - VectorParts Mask; - bool IsMaskRequired = BlockInMask; - if (IsMaskRequired) { - Mask = *BlockInMask; - // TODO: extend the masked interleaved-group support to reversed access. - assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); - } - // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the @@ -2032,19 +2011,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, // For each unroll part, create a wide load for the group. 
SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - Instruction *NewLoad; - if (IsMaskRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); - Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); - NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), - ShuffledMask, UndefVec, - "wide.masked.vec"); - } - else - NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], - Group->getAlignment(), "wide.vec"); + auto *NewLoad = Builder.CreateAlignedLoad( + NewPtrs[Part], Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2111,18 +2079,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); - Instruction *NewStoreInstr; - if (IsMaskRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); - auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); - Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); - NewStoreInstr = Builder.CreateMaskedStore( - IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); - } - else - NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], - Group->getAlignment()); + Instruction *NewStoreInstr = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); Group->addMetadata(NewStoreInstr); } @@ -4295,32 +4253,6 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } -static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { - if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) - return TTI.enableMaskedInterleavedAccessVectorization(); - - // If an override option has been passed in for interleaved accesses, use it. 
- return EnableMaskedInterleavedMemAccesses; -} - -bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, - unsigned VF) { - assert(isAccessInterleaved(I) && "Expecting interleaved access."); - assert(getWideningDecision(I, VF) == CM_Unknown && - "Decision should not be set yet."); - - if (!Legal->blockNeedsPredication(I->getParent()) || - !Legal->isMaskRequired(I)) - return true; - - if (!useMaskedInterleavedAccesses(TTI)) - return false; - - auto *Ty = getMemInstValueType(I); - return isa(I) ? TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); -} - bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, unsigned VF) { // Get and ensure we have a valid memory instruction. @@ -5439,17 +5371,13 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS, Legal->isMaskRequired(I)); - - if (Group->isReverse()) { - // TODO: Add support for reversed masked interleaved access. 
- assert(!Legal->isMaskRequired(I) && - "Reverse masked interleaved access not supported."); + unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getFactor(), Indices, + Group->getAlignment(), AS); + + if (Group->isReverse()) Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); - } return Cost; } @@ -5551,8 +5479,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { continue; NumAccesses = Group->getNumMembers(); - if (interleavedAccessCanBeWidened(&I, VF)) - InterleaveCost = getInterleaveGroupCost(&I, VF); + InterleaveCost = getInterleaveGroupCost(&I, VF); } unsigned GatherScatterCost = @@ -6225,8 +6152,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { } VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan) { + VFRange &Range) { const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6248,11 +6174,7 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, assert(I == IG->getInsertPos() && "Generating a recipe for an adjunct member of an interleave group"); - VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); - - return new VPInterleaveRecipe(IG, Mask); + return new VPInterleaveRecipe(IG); } VPWidenMemoryInstructionRecipe * @@ -6520,7 +6442,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPRecipeBase *Recipe = nullptr; // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. 
- if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { + if ((Recipe = tryToInterleaveMemory(Instr, Range))) { VPBB->appendRecipe(Recipe); return true; } @@ -6747,10 +6669,6 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { - O << ", "; - User->getOperand(0)->printAsOperand(O); - } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) @@ -6813,15 +6731,7 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7120,7 +7030,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Analyze interleaved memory accesses. if (UseInterleaved) { - IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + IAI.analyzeInterleaving(); } // Use the cost model. diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h index 15d38ac9c84..f43a8bb123b 100644 --- a/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,8 +69,7 @@ public: /// \return value is , as it is handled by another recipe. /// \p Range.End may be decreased to ensure same decision from \p Range.Start /// to \p Range.End. 
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index 81b1986c97d..c3123b41600 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -769,14 +769,10 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup *IG; - std::unique_ptr User; public: - VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); - } + VPInterleaveRecipe(const InterleaveGroup *IG) + : VPRecipeBase(VPInterleaveSC), IG(IG) {} ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. 
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll deleted file mode 100644 index b1163d0a199..00000000000 --- a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ /dev/null @@ -1,164 +0,0 @@ -; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED -; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED - -target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" -target triple = "i386-unknown-linux-gnu" - -; When masked-interleaved-groups are disabled: -; Check that the predicated load is not vectorized as an -; interleaved-group but rather as a scalarized accesses. -; (For SKX, Gather is not supported by the compiler for chars, therefore -; the only remaining alternative is to scalarize). -; When masked-interleave-group is enabled we expect to find the proper mask -; shuffling code, feeding the wide masked load for an interleave-group (with -; a single member). -; -; void masked_strided1(const unsigned char* restrict p, -; unsigned char* restrict q, -; unsigned char guard) { -; for(ix=0; ix < 1024; ++ix) { -; if (ix > guard) { -; char t = p[2*ix]; -; q[ix] = t; -; } -; } -; } - -;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1( -;DISABLED_MASKED_STRIDED: vector.body: -;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. 
-;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> -;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], -;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 -;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> - -;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> -;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) -;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> - -define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { -entry: - %conv = zext i8 %guard to i32 - br label %for.body - -for.body: - %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp1 = icmp ugt i32 %ix.09, %conv - br i1 %cmp1, label %if.then, label %for.inc - -if.then: - %mul = shl nuw nsw i32 %ix.09, 1 - %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul - %0 = load i8, i8* %arrayidx, align 1 - %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 - store i8 %0, i8* %arrayidx3, align 1 - br label %for.inc - 
-for.inc: - %inc = add nuw nsw i32 %ix.09, 1 - %exitcond = icmp eq i32 %inc, 1024 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - -; Check also a scenario with full interleave-groups (no gaps) as well as both -; load and store groups. We check that when masked-interleave-group is disabled -; the predicated loads (and stores) are not vectorized as an -; interleaved-group but rather as four separate scalarized accesses. -; (For SKX, gather/scatter is not supported by the compiler for chars, therefore -; the only remaining alternative is to scalarize). -; When masked-interleave-group is enabled we expect to find the proper mask -; shuffling code, feeding the wide masked load/store for the two interleave- -; groups. -; -; void masked_strided2(const unsigned char* restrict p, -; unsigned char* restrict q, -; unsigned char guard) { -; for(ix=0; ix < 1024; ++ix) { -; if (ix > guard) { -; char left = p[2*ix]; -; char right = p[2*ix + 1]; -; char max = max(left, right); -; q[2*ix] = max; -; q[2*ix+1] = 0 - max; -; } -; } -;} - -;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2( -;DISABLED_MASKED_STRIDED: vector.body: -;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. 
-;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> -;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], -;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 -;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue -;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. -;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. -;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> - -;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2( -;ENABLED_MASKED_STRIDED: vector.body: -;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 -;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ -;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} -;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> -;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) -;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> -;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask) - -; Function Attrs: norecurse nounwind -define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { -entry: - %conv = zext i8 %guard to i32 - br label %for.body - -for.body: - %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp1 = 
icmp ugt i32 %ix.024, %conv - br i1 %cmp1, label %if.then, label %for.inc - -if.then: - %mul = shl nuw nsw i32 %ix.024, 1 - %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul - %0 = load i8, i8* %arrayidx, align 1 - %add = or i32 %mul, 1 - %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add - %1 = load i8, i8* %arrayidx4, align 1 - %cmp.i = icmp slt i8 %0, %1 - %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 - %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul - store i8 %spec.select.i, i8* %arrayidx6, align 1 - %sub = sub i8 0, %spec.select.i - %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add - store i8 %sub, i8* %arrayidx11, align 1 - br label %for.inc - -for.inc: - %inc = add nuw nsw i32 %ix.024, 1 - %exitcond = icmp eq i32 %inc, 1024 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll deleted file mode 100644 index 9ed66a22dbf..00000000000 --- a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll +++ /dev/null @@ -1,222 +0,0 @@ -; REQUIRES: asserts -; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED -; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED - -target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" - -; We test here that the loop-vectorizer forms an interleave-groups from -; predicated memory accesses only if they are both in the same (predicated) -; block (first scenario below). 
-; If the accesses are not in the same predicated block, an interleave-group -; is not formed (scenarios 2,3 below). - -; Scenario 1: Check the case where it is legal to create masked interleave- -; groups. Altogether two groups are created (one for loads and one for stores) -; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses -; are disabled we do not create any interleave-group. -; -; void masked_strided1(const unsigned char* restrict p, -; unsigned char* restrict q, -; unsigned char guard) { -; for(ix=0; ix < 1024; ++ix) { -; if (ix > guard) { -; char left = p[2*ix]; -; char right = p[2*ix + 1]; -; char max = max(left, right); -; q[2*ix] = max; -; q[2*ix+1] = 0 - max; -; } -; } -;} - - -; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" -; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... -; STRIDED_UNMASKED-NOT: LV: Creating an interleave group - -; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" -; STRIDED_MASKED: LV: Analyzing interleaved accesses... -; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1 - -; Scenario 2: Check the case where it is illegal to create a masked interleave- -; group because the first access is predicated, and the second isn't. -; We therefore create a separate interleave-group with gaps for each of the -; stores (if masked-interleaved-accesses are enabled) and these are later -; invalidated because interleave-groups of stores with gaps are not supported. 
-; If masked-interleaved-accesses is not enabled we create only one interleave -; group of stores (for the non-predicated store) and it is later invalidated -; due to gaps. -; -; void masked_strided2(const unsigned char* restrict p, -; unsigned char* restrict q, -; unsigned char guard1, -; unsigned char guard2) { -; for(ix=0; ix < 1024; ++ix) { -; if (ix > guard1) { -; q[2*ix] = 1; -; } -; q[2*ix+1] = 2; -; } -;} - -; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" -; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... -; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 -; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. -; STRIDED_UNMASKED-NOT: LV: Creating an interleave group - -; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" -; STRIDED_MASKED: LV: Analyzing interleaved accesses... -; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. - - -; Scenario 3: Check the case where it is illegal to create a masked interleave- -; group because the two accesses are in separate predicated blocks. -; We therefore create a separate interleave-group with gaps for each of the accesses, -; (which are later invalidated because interleave-groups of stores with gaps are -; not supported). -; If masked-interleaved-accesses is not enabled we don't create any interleave -; group because all accesses are predicated. 
-; -; void masked_strided3(const unsigned char* restrict p, -; unsigned char* restrict q, -; unsigned char guard1, -; unsigned char guard2) { -; for(ix=0; ix < 1024; ++ix) { -; if (ix > guard1) { -; q[2*ix] = 1; -; } -; if (ix > guard2) { -; q[2*ix+1] = 2; -; } -; } -;} - - -; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" -; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... -; STRIDED_UNMASKED-NOT: LV: Creating an interleave group - -; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" -; STRIDED_MASKED: LV: Analyzing interleaved accesses... -; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. -; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. - - -; ModuleID = 'test.c' -source_filename = "test.c" -target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" -target triple = "i386-unknown-linux-gnu" - -define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { -entry: - %conv = zext i8 %guard to i32 - br label %for.body - -for.body: - %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %cmp1 = icmp ugt i32 %ix.024, %conv - br i1 %cmp1, label %if.then, label %for.inc - -if.then: - %mul = shl nuw nsw i32 %ix.024, 1 - %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul - %0 = load i8, i8* %arrayidx, align 1 - %add = or i32 %mul, 1 - %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add - %1 = load i8, i8* %arrayidx4, align 1 - %cmp.i = icmp slt i8 %0, %1 - %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 - %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul - store i8 %spec.select.i, i8* %arrayidx6, align 1 - %sub = sub i8 0, %spec.select.i - %arrayidx11 = getelementptr inbounds i8, i8* 
%q, i32 %add - store i8 %sub, i8* %arrayidx11, align 1 - br label %for.inc - -for.inc: - %inc = add nuw nsw i32 %ix.024, 1 - %exitcond = icmp eq i32 %inc, 1024 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - - -define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { -entry: - %conv = zext i8 %guard to i32 - br label %for.body - -for.body: - %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %mul = shl nuw nsw i32 %ix.012, 1 - %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul - store i8 1, i8* %arrayidx, align 1 - %cmp1 = icmp ugt i32 %ix.012, %conv - br i1 %cmp1, label %if.then, label %for.inc - -if.then: - %add = or i32 %mul, 1 - %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add - store i8 2, i8* %arrayidx3, align 1 - br label %for.inc - -for.inc: - %inc = add nuw nsw i32 %ix.012, 1 - %exitcond = icmp eq i32 %inc, 1024 - br i1 %exitcond, label %for.end, label %for.body - -for.end: - ret void -} - - -define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 { -entry: - %conv = zext i8 %guard1 to i32 - %conv3 = zext i8 %guard2 to i32 - br label %for.body - -for.body: - %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] - %mul = shl nuw nsw i32 %ix.018, 1 - %cmp1 = icmp ugt i32 %ix.018, %conv - br i1 %cmp1, label %if.then, label %if.end - -if.then: - %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul - store i8 1, i8* %arrayidx, align 1 - br label %if.end - -if.end: - %cmp4 = icmp ugt i32 %ix.018, %conv3 - br i1 %cmp4, label %if.then6, label %for.inc - -if.then6: - %add = or i32 %mul, 1 - %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add - store i8 2, i8* %arrayidx7, align 1 - br label %for.inc - -for.inc: - %inc = add nuw nsw i32 %ix.018, 1 - %exitcond = icmp eq i32 %inc, 1024 - br i1 %exitcond, label %for.end, label 
%for.body - -for.end: - ret void -} - -attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index c647f586b18..89c0ac10916 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -1,5 +1,4 @@ ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" %pair = type { i64, i64 } -- GitLab From 7d7250490bddc598c7811fe7f30a9ffb8bb5acfe Mon Sep 17 00:00:00 2001 From: Dorit Nuzman Date: Sun, 14 Oct 2018 08:50:06 +0000 Subject: [PATCH 0166/1116] recommit 344472 after fixing build failure on ARM and PPC. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344475 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 19 +- .../llvm/Analysis/TargetTransformInfoImpl.h | 6 +- include/llvm/Analysis/VectorUtils.h | 21 +- include/llvm/CodeGen/BasicTTIImpl.h | 38 ++- lib/Analysis/TargetTransformInfo.cpp | 10 +- lib/Analysis/VectorUtils.cpp | 29 ++- .../AArch64/AArch64TargetTransformInfo.cpp | 7 +- .../AArch64/AArch64TargetTransformInfo.h | 2 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 8 +- lib/Target/ARM/ARMTargetTransformInfo.h | 2 +- .../Hexagon/HexagonTargetTransformInfo.cpp | 6 +- .../Hexagon/HexagonTargetTransformInfo.h | 2 +- lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 7 +- lib/Target/PowerPC/PPCTargetTransformInfo.h | 3 +- .../SystemZ/SystemZTargetTransformInfo.cpp | 6 +- .../SystemZ/SystemZTargetTransformInfo.h | 2 +- lib/Target/X86/X86TargetTransformInfo.cpp | 23 +- lib/Target/X86/X86TargetTransformInfo.h | 9 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 126 ++++++++-- lib/Transforms/Vectorize/VPRecipeBuilder.h | 3 +- lib/Transforms/Vectorize/VPlan.h | 8 +- .../x86-interleaved-accesses-masked-group.ll | 164 +++++++++++++ .../interleaved-accesses-masked-group.ll | 222 ++++++++++++++++++ .../interleaved-accesses-pred-stores.ll | 1 + 24 files changed, 654 insertions(+), 70 deletions(-) create mode 100644 test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll create mode 100644 test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 18b5a5cf0e5..c2a9d1ec195 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -587,6 +587,10 @@ public: /// Enable matching of interleaved access groups. 
bool enableInterleavedAccessVectorization() const; + /// Enable matching of interleaved access groups that contain predicated + /// accesses and are vectorized using masked vector loads/stores. + bool enableMaskedInterleavedAccessVectorization() const; + /// Indicate that it is potentially unsafe to automatically vectorize /// floating-point operations because the semantics of vector and scalar /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math @@ -821,9 +825,11 @@ public: /// load allows gaps) /// \p Alignment is the alignment of the memory operation /// \p AddressSpace is address space of the pointer. + /// \p IsMasked indicates if the memory access is predicated. int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) const; + unsigned AddressSpace, + bool IsMasked = false) const; /// Calculate the cost of performing a vector reduction. /// @@ -1072,6 +1078,7 @@ public: virtual const MemCmpExpansionOptions *enableMemCmpExpansion( bool IsZeroCmp) const = 0; virtual bool enableInterleavedAccessVectorization() = 0; + virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, @@ -1132,7 +1139,8 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) = 0; + unsigned AddressSpace, + bool IsMasked = false) = 0; virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy, @@ -1346,6 +1354,9 @@ public: bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } + bool enableMaskedInterleavedAccessVectorization() override { + return Impl.enableMaskedInterleavedAccessVectorization(); + } bool isFPVectorizationPotentiallyUnsafe() override { return 
Impl.isFPVectorizationPotentiallyUnsafe(); } @@ -1471,9 +1482,9 @@ public: } int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) override { + unsigned AddressSpace, bool IsMasked) override { return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int getArithmeticReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index e39fe66c0a4..c64d4d36805 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -313,6 +313,8 @@ public: bool enableInterleavedAccessVectorization() { return false; } + bool enableMaskedInterleavedAccessVectorization() { return false; } + bool isFPVectorizationPotentiallyUnsafe() { return false; } bool allowsMisalignedMemoryAccesses(LLVMContext &Context, @@ -450,8 +452,8 @@ public: unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false) { return 1; } diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h index 622d932f74f..2ac49f67662 100644 --- a/include/llvm/Analysis/VectorUtils.h +++ b/include/llvm/Analysis/VectorUtils.h @@ -125,6 +125,21 @@ computeMinimumValueSizes(ArrayRef Blocks, /// This function always sets a (possibly null) value for each K in Kinds. Instruction *propagateMetadata(Instruction *I, ArrayRef VL); +/// Create a mask with replicated elements. +/// +/// This function creates a shuffle mask for replicating each of the \p VF +/// elements in a vector \p ReplicationFactor times. 
It can be used to +/// transform a mask of \p VF elements into a mask of +/// \p VF * \p ReplicationFactor elements used by a predicated +/// interleaved-group of loads/stores whose Interleaved-factor == +/// \p ReplicationFactor. +/// +/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is: +/// +/// <0,0,0,1,1,1,2,2,2,3,3,3> +Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor, + unsigned VF); + /// Create an interleave shuffle mask. /// /// This function creates a shuffle mask for interleaving \p NumVecs vectors of @@ -328,7 +343,7 @@ public: InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L, DominatorTree *DT, LoopInfo *LI, const LoopAccessInfo *LAI) - : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} + : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {} ~InterleavedAccessInfo() { SmallPtrSet DelSet; @@ -341,7 +356,9 @@ public: /// Analyze the interleaved accesses and collect them in interleave /// groups. Substitute symbolic strides using \p Strides. - void analyzeInterleaving(); + /// Consider also predicated loads/stores in the analysis if + /// \p EnableMaskedInterleavedGroup is true. + void analyzeInterleaving(bool EnableMaskedInterleavedGroup); /// Check if \p Instr belongs to any interleave group. 
bool isInterleaved(Instruction *Instr) const { diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index b460cdc0ba1..e740fe57172 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -783,8 +783,8 @@ public: unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace) { + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false) { VectorType *VT = dyn_cast(VecTy); assert(VT && "Expect a vector type for interleaved memory op"); @@ -795,8 +795,13 @@ public: VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); // Firstly, the cost of load/store operation. - unsigned Cost = static_cast(this)->getMemoryOpCost( - Opcode, VecTy, Alignment, AddressSpace); + unsigned Cost; + if (IsMasked) + Cost = static_cast(this)->getMaskedMemoryOpCost( + Opcode, VecTy, Alignment, AddressSpace); + else + Cost = static_cast(this)->getMemoryOpCost(Opcode, VecTy, Alignment, + AddressSpace); // Legalize the vector type, and get the legalized and unlegalized type // sizes. @@ -892,6 +897,31 @@ public: ->getVectorInstrCost(Instruction::InsertElement, VT, i); } + if (!IsMasked) + return Cost; + + Type *I8Type = Type::getInt8Ty(VT->getContext()); + VectorType *MaskVT = VectorType::get(I8Type, NumElts); + SubVT = VectorType::get(I8Type, NumSubElts); + + // The Mask shuffling cost is extract all the elements of the Mask + // and insert each of them Factor times into the wide vector: + // + // E.g. an interleaved group with factor 3: + // %mask = icmp ult <8 x i32> %vec1, %vec2 + // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, + // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> + // The cost is estimated as extract all mask elements from the <8xi1> mask + // vector and insert them factor times into the <24xi1> shuffled mask + // vector. 
+ for (unsigned i = 0; i < NumSubElts; i++) + Cost += static_cast(this)->getVectorInstrCost( + Instruction::ExtractElement, SubVT, i); + + for (unsigned i = 0; i < NumElts; i++) + Cost += static_cast(this)->getVectorInstrCost( + Instruction::InsertElement, MaskVT, i); + return Cost; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 4ad48e351a4..867403d0ef1 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -268,6 +268,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } +bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { + return TTIImpl->enableMaskedInterleavedAccessVectorization(); +} + bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const { return TTIImpl->isFPVectorizationPotentiallyUnsafe(); } @@ -515,9 +519,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) const { - int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + unsigned Alignment, unsigned AddressSpace, bool IsMasked) const { + int Cost = TTIImpl->getInterleavedMemoryOpCost( + Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 272c665ace1..e14449b8838 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -502,6 +502,16 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef VL) { return Inst; } +Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, + unsigned ReplicationFactor, unsigned VF) { + SmallVector MaskVec; + for 
(unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < ReplicationFactor; j++) + MaskVec.push_back(Builder.getInt32(i)); + + return ConstantVector::get(MaskVec); +} + Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs) { SmallVector Mask; @@ -672,7 +682,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving() { +void InterleavedAccessInfo::analyzeInterleaving( + bool EnablePredicatedInterleavedMemAccesses) { LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); const ValueToValueMap &Strides = LAI->getSymbolicStrides(); @@ -712,9 +723,8 @@ void InterleavedAccessInfo::analyzeInterleaving() { // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. InterleaveGroup *Group = nullptr; - // TODO: Ignore B if it is in a predicated block. This restriction can be - // relaxed in the future once we handle masked interleaved groups. - if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) { + if (isStrided(DesB.Stride) && + (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { Group = getInterleaveGroup(B); if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B @@ -808,11 +818,12 @@ void InterleavedAccessInfo::analyzeInterleaving() { if (DistanceToB % static_cast(DesB.Size)) continue; - // Ignore A if either A or B is in a predicated block. Although we - // currently prevent group formation for predicated accesses, we may be - // able to relax this limitation in the future once we handle more - // complicated blocks. 
- if (isPredicated(A->getParent()) || isPredicated(B->getParent())) + // All members of a predicated interleave-group must have the same predicate, + // and currently must reside in the same BB. + BasicBlock *BlockA = A->getParent(); + BasicBlock *BlockB = B->getParent(); + if ((isPredicated(BlockA) || isPredicated(BlockB)) && + (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB)) continue; // The index of A is the index of B plus A's distance to B in multiples diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 96e751e8697..a16de89cf10 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -659,11 +659,12 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); - if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { + if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -676,7 +677,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index c056a7d2428..b3893d32850 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -146,7 +146,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef 
Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1b0d162f726..90e0cd96682 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -542,14 +542,16 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; - if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && + !IsMasked) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -562,7 +564,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 7d14bd7c256..84e3055c6bc 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -169,7 +169,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); diff --git 
a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 4d0e7dc52e8..79b269bccfe 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace) { - if (Indices.size() != Factor) + unsigned Alignment, unsigned AddressSpace, bool IsMasked) { + if (Indices.size() != Factor || IsMasked) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); } diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 2c03cd268ff..901a91692e8 100644 --- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -123,7 +123,7 @@ public: bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index b0da9b5a6d7..f67bacc87ec 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -473,7 +473,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool 
IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); + assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h index 2ee2b3eb808..252d46e7a2a 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -90,7 +90,8 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, + bool IsMasked = false); /// @} }; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 6f553d5bed3..1eaeb9699bf 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -909,7 +909,11 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index bfa942357c5..92b2b9bdcb8 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,7 @@ public: unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); /// @} }; diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index d3a75123935..82e4dfe25b7 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2723,7 +2723,12 @@ int 
X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2832,7 +2837,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -2950,7 +2960,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -2962,11 +2973,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h 
b/lib/Target/X86/X86TargetTransformInfo.h index 3df89903882..2bd778a4211 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -101,13 +101,16 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getIntImmCost(int64_t); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 7ebe8d102b7..e93cfb34156 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,6 +172,10 @@ static cl::opt EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); +static cl::opt EnableMaskedInterleavedMemAccesses( + "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); + /// We don't interleave loops with a known constant trip count below this /// number. static const unsigned TinyTripCountInterleaveThreshold = 128; @@ -408,8 +412,10 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to. 
- void vectorizeInterleaveGroup(Instruction *Instr); + /// Try to vectorize the interleaved access group that \p Instr belongs to, + /// optionally masking the vector operations if \p BlockInMask is non-null. + void vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask = nullptr); /// Vectorize Load and Store instructions, optionally masking the vector /// operations if \p BlockInMask is non-null. @@ -1112,6 +1118,11 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction in an interleaved-group + /// of memory accesses that can be vectorized with wide vector loads/stores + /// and shuffles. + bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); @@ -1946,7 +1957,8 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask) { const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -1968,6 +1980,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { SmallVector NewPtrs; unsigned Index = Group->getIndex(Instr); + VectorParts Mask; + bool IsMaskRequired = BlockInMask; + if (IsMaskRequired) { + Mask = *BlockInMask; + // TODO: extend the masked interleaved-group support to reversed access. 
+ assert(!Group->isReverse() && "Reversed masked interleave-group " + "not supported."); + } + // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the @@ -2011,8 +2032,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // For each unroll part, create a wide load for the group. SmallVector NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - auto *NewLoad = Builder.CreateAlignedLoad( - NewPtrs[Part], Group->getAlignment(), "wide.vec"); + Instruction *NewLoad; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + ShuffledMask, UndefVec, + "wide.masked.vec"); + } + else + NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], + Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2079,8 +2111,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); - Instruction *NewStoreInstr = - Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + Instruction *NewStoreInstr; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewStoreInstr = Builder.CreateMaskedStore( + IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + } + else + NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], + Group->getAlignment()); 
Group->addMetadata(NewStoreInstr); } @@ -4253,6 +4295,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } +static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { + if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) + return TTI.enableMaskedInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + return EnableMaskedInterleavedMemAccesses; +} + +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, + unsigned VF) { + assert(isAccessInterleaved(I) && "Expecting interleaved access."); + assert(getWideningDecision(I, VF) == CM_Unknown && + "Decision should not be set yet."); + + if (!Legal->blockNeedsPredication(I->getParent()) || + !Legal->isMaskRequired(I)) + return true; + + if (!useMaskedInterleavedAccesses(TTI)) + return false; + + auto *Ty = getMemInstValueType(I); + return isa(I) ? TTI.isLegalMaskedLoad(Ty) + : TTI.isLegalMaskedStore(Ty); +} + bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, unsigned VF) { // Get and ensure we have a valid memory instruction. @@ -5371,13 +5439,17 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, - Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlignment(), AS, Legal->isMaskRequired(I)); + + if (Group->isReverse()) { + // TODO: Add support for reversed masked interleaved access. 
+ assert(!Legal->isMaskRequired(I) && + "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + } return Cost; } @@ -5479,7 +5551,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { continue; NumAccesses = Group->getNumMembers(); - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, VF)) + InterleaveCost = getInterleaveGroupCost(&I, VF); } unsigned GatherScatterCost = @@ -6152,7 +6225,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { } VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { + VFRange &Range, + VPlanPtr &Plan) { const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6174,7 +6248,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, assert(I == IG->getInsertPos() && "Generating a recipe for an adjunct member of an interleave group"); - return new VPInterleaveRecipe(IG); + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(I)) + Mask = createBlockInMask(I->getParent(), Plan); + + return new VPInterleaveRecipe(IG, Mask); } VPWidenMemoryInstructionRecipe * @@ -6442,7 +6520,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPRecipeBase *Recipe = nullptr; // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. 
- if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { VPBB->appendRecipe(Recipe); return true; } @@ -6669,6 +6747,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); + if (User) { + O << ", "; + User->getOperand(0)->printAsOperand(O); + } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) @@ -6731,7 +6813,15 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + if (!User) + return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + + // Last (and currently only) operand is a mask. + InnerLoopVectorizer::VectorParts MaskValues(State.UF); + VPValue *Mask = User->getOperand(User->getNumOperands() - 1); + for (unsigned Part = 0; Part < State.UF; ++Part) + MaskValues[Part] = State.get(Mask, Part); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7030,7 +7120,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Analyze interleaved memory accesses. if (UseInterleaved) { - IAI.analyzeInterleaving(); + IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); } // Use the cost model. diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h index f43a8bb123b..15d38ac9c84 100644 --- a/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,7 +69,8 @@ public: /// \return value is , as it is handled by another recipe. /// \p Range.End may be decreased to ensure same decision from \p Range.Start /// to \p Range.End. 
- VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); + VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h index c3123b41600..81b1986c97d 100644 --- a/lib/Transforms/Vectorize/VPlan.h +++ b/lib/Transforms/Vectorize/VPlan.h @@ -769,10 +769,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup *IG; + std::unique_ptr User; public: - VPInterleaveRecipe(const InterleaveGroup *IG) - : VPRecipeBase(VPInterleaveSC), IG(IG) {} + VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG) { + if (Mask) // Create a VPInstruction to register as a user of the mask. + User.reset(new VPUser({Mask})); + } ~VPInterleaveRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. 
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll new file mode 100644 index 00000000000..b1163d0a199 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -0,0 +1,164 @@ +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED +; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +; When masked-interleaved-groups are disabled: +; Check that the predicated load is not vectorized as an +; interleaved-group but rather as scalarized accesses. +; (For SKX, Gather is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load for an interleave-group (with +; a single member). +; +; void masked_strided1(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char t = p[2*ix]; +; q[ix] = t; +; } +; } +; } + +;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;DISABLED_MASKED_STRIDED: vector.body: +;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> +;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + 
+for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; Check also a scenario with full interleave-groups (no gaps) as well as both +; load and store groups. We check that when masked-interleave-group is disabled +; the predicated loads (and stores) are not vectorized as an +; interleaved-group but rather as four separate scalarized accesses. +; (For SKX, gather/scatter is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load/store for the two interleave- +; groups. +; +; void masked_strided2(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + +;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2( +;DISABLED_MASKED_STRIDED: vector.body: +;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. 
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> +;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], +;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask = +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load. +;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store. +;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> +;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask) + +; Function Attrs: norecurse nounwind +define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = 
icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.024, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx4, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll new file mode 100644 index 00000000000..9ed66a22dbf --- /dev/null +++ b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll @@ -0,0 +1,222 @@ +; REQUIRES: asserts +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + +; We test here that the loop-vectorizer forms an interleave-groups from +; predicated memory accesses only if they are both in the same (predicated) +; block (first scenario below). 
+; If the accesses are not in the same predicated block, an interleave-group +; is not formed (scenarios 2 and 3 below). + +; Scenario 1: Check the case where it is legal to create masked interleave- +; groups. Altogether two groups are created (one for loads and one for stores) +; when masked-interleaved-accesses are enabled. When masked-interleaved-accesses +; are disabled we do not create any interleave-group. +; +; void masked_strided1(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1 + +; Scenario 2: Check the case where it is illegal to create a masked interleave- +; group because the first access is predicated, and the second isn't. +; We therefore create a separate interleave-group with gaps for each of the +; stores (if masked-interleaved-accesses are enabled) and these are later +; invalidated because interleave-groups of stores with gaps are not supported.
+; If masked-interleaved-accesses is not enabled we create only one interleave +; group of stores (for the non-predicated store) and it is later invalidated +; due to gaps. +; +; void masked_strided2(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard1, +; unsigned char guard2) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard1) { +; q[2*ix] = 1; +; } +; q[2*ix+1] = 2; +; } +;} + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. + + +; Scenario 3: Check the case where it is illegal to create a masked interleave- +; group because the two accesses are in separate predicated blocks. +; We therefore create a separate interleave-group with gaps for each of the accesses, +; (which are later invalidated because interleave-groups of stores with gaps are +; not supported). +; If masked-interleaved-accesses is not enabled we don't create any interleave +; group because all accesses are predicated. 
+; +; void masked_strided3(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard1, +; unsigned char guard2) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard1) { +; q[2*ix] = 1; +; } +; if (ix > guard2) { +; q[2*ix+1] = 2; +; } +; } +;} + + +; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" +; STRIDED_UNMASKED: LV: Analyzing interleaved accesses... +; STRIDED_UNMASKED-NOT: LV: Creating an interleave group + +; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" +; STRIDED_MASKED: LV: Analyzing interleaved accesses... +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1 +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. +; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps. + + +; ModuleID = 'test.c' +source_filename = "test.c" +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.024, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx4, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* 
%q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %mul = shl nuw nsw i32 %ix.012, 1 + %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 1, i8* %arrayidx, align 1 + %cmp1 = icmp ugt i32 %ix.012, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %add = or i32 %mul, 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 2, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.012, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + + +define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 { +entry: + %conv = zext i8 %guard1 to i32 + %conv3 = zext i8 %guard2 to i32 + br label %for.body + +for.body: + %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %mul = shl nuw nsw i32 %ix.018, 1 + %cmp1 = icmp ugt i32 %ix.018, %conv + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 1, i8* %arrayidx, align 1 + br label %if.end + +if.end: + %cmp4 = icmp ugt i32 %ix.018, %conv3 + br i1 %cmp4, label %if.then6, label %for.inc + +if.then6: + %add = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 2, i8* %arrayidx7, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.018, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label 
%for.body + +for.end: + ret void +} + +attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll index 89c0ac10916..c647f586b18 100644 --- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" %pair = type { i64, i64 } -- GitLab From 3394148166680c0a876eab4de4ba9092fef5cd3a Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Sun, 14 Oct 2018 15:25:06 +0000 Subject: [PATCH 0167/1116] [InstCombine] combine a shuffle and an extract subvector shuffle This is part of the missing IR-level folding noted in D52912. This should be ok as a canonicalization because the new shuffle mask can't be any more complicated than the existing shuffle mask. If there's some target where the shorter vector shuffle is not legal, it should just end up expanding to something like the pair of shuffles that we're starting with here. 
Differential Revision: https://reviews.llvm.org/D53037 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344476 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineVectorOps.cpp | 38 +++++++++++++++++++ test/Transforms/InstCombine/vec_shuffle.ll | 8 ++-- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 61a3e31f960..bdd8fe3eead 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -1477,6 +1477,41 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf, return SelectInst::Create(NarrowCond, NarrowX, NarrowY); } +/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask. +static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) { + Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1); + if (!Shuf.isIdentityWithExtract() || !isa(Op1)) + return nullptr; + + Value *X, *Y; + Constant *Mask; + if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask)))) + return nullptr; + + // We are extracting a subvector from a shuffle. Remove excess elements from + // the 1st shuffle mask to eliminate the extract. + // + // This transform is conservatively limited to identity extracts because we do + // not allow arbitrary shuffle mask creation as a target-independent transform + // (because we can't guarantee that will lower efficiently). + // + // If the extracting shuffle has an undef mask element, it transfers to the + // new shuffle mask. Otherwise, copy the original mask element. 
Example: + // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> --> + // shuf X, Y, <C0, undef, C2, undef> + unsigned NumElts = Shuf.getType()->getVectorNumElements(); + SmallVector NewMask(NumElts); + assert(NumElts < Mask->getType()->getVectorNumElements() && + "Identity with extract must have less elements than its inputs"); + + for (unsigned i = 0; i != NumElts; ++i) { + Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i); + Constant *MaskElt = Mask->getAggregateElement(i); + NewMask[i] = isa(ExtractMaskElt) ? ExtractMaskElt : MaskElt; + } + return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); +} + Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { Value *LHS = SVI.getOperand(0); Value *RHS = SVI.getOperand(1); @@ -1499,6 +1534,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } + if (Instruction *I = foldIdentityExtractShuffle(SVI)) + return I; + SmallVector Mask = SVI.getShuffleMask(); Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll index e9c3539ef6b..7692fe3e05c 100644 --- a/test/Transforms/InstCombine/vec_shuffle.ll +++ b/test/Transforms/InstCombine/vec_shuffle.ll @@ -170,12 +170,11 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) { ret <8 x i8> %t3 } -; TODO: The mask length of the 1st shuffle can be reduced to eliminate the 2nd shuffle. +; The mask length of the 1st shuffle can be reduced to eliminate the 2nd shuffle.
define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @extract_subvector_of_shuffle( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <3 x i32> -; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <3 x i8> [[SHUF]], <3 x i8> undef, <2 x i32> +; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <2 x i32> ; CHECK-NEXT: ret <2 x i8> [[EXTRACT_SUBV]] ; %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <3 x i32> @@ -183,7 +182,6 @@ define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %extract_subv } -; TODO: ; Extra uses are ok. ; Undef elements in either mask are ok. Undefs from the 2nd shuffle mask should propagate to the new shuffle. ; The type of the inputs does not have to match the output type. @@ -194,7 +192,7 @@ define <4 x i8> @extract_subvector_of_shuffle_extra_use(<2 x i8> %x, <2 x i8> %y ; CHECK-LABEL: @extract_subvector_of_shuffle_extra_use( ; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i8> [[X:%.*]], <2 x i8> [[Y:%.*]], <5 x i32> ; CHECK-NEXT: call void @use_v5i8(<5 x i8> [[SHUF]]) -; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <5 x i8> [[SHUF]], <5 x i8> undef, <4 x i32> +; CHECK-NEXT: [[EXTRACT_SUBV:%.*]] = shufflevector <2 x i8> [[X]], <2 x i8> [[Y]], <4 x i32> ; CHECK-NEXT: ret <4 x i8> [[EXTRACT_SUBV]] ; %shuf = shufflevector <2 x i8> %x, <2 x i8> %y, <5 x i32> -- GitLab From ffc6fe6727622c4e63572c4d676658b26e21ceba Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 14 Oct 2018 15:56:28 +0000 Subject: [PATCH 0168/1116] [LegalizeDAG] Don't bother with final MUL+SRL stage for byte CTPOP. The final stage of CTPOP expansion (v = (v * 0x01010101...) >> (Len - 8)) is completely pointless for the byte (Len = 8) case as it reduces to (v = (v * 0x01...) >> 0), but annoyingly this doesn't always get optimized away. Found while investigating generic vector CTPOP expansion (PR32655). 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344477 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index bb2c76a6a41..175df889ef2 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2750,9 +2750,10 @@ SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op, DAG.getConstant(4, dl, ShVT))), Mask0F); // v = (v * 0x01010101...) >> (Len - 8) - Op = DAG.getNode(ISD::SRL, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, Op, Mask01), - DAG.getConstant(Len - 8, dl, ShVT)); + if (Len > 8) + Op = DAG.getNode(ISD::SRL, dl, VT, + DAG.getNode(ISD::MUL, dl, VT, Op, Mask01), + DAG.getConstant(Len - 8, dl, ShVT)); return Op; } -- GitLab From c8309b5ac33fcd1c4ac977a1cfbba7f4e1cdf6e0 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Sun, 14 Oct 2018 16:09:59 +0000 Subject: [PATCH 0169/1116] [ORC] Remove XXLayer::add methods that default to using the main JITDylib. They're not currently used and may complicate upcoming changes to add's signature and behavior. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344478 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/Layer.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/Layer.h b/include/llvm/ExecutionEngine/Orc/Layer.h index 3bd23ae5416..be5d9653dd8 100644 --- a/include/llvm/ExecutionEngine/Orc/Layer.h +++ b/include/llvm/ExecutionEngine/Orc/Layer.h @@ -51,12 +51,6 @@ public: /// JITDylib. virtual Error add(JITDylib &JD, VModuleKey K, ThreadSafeModule TSM); - /// Adds a MaterializationUnit representing the given IR to the main - /// JITDylib. - Error add(VModuleKey K, ThreadSafeModule TSM) { - return add(ES.getMainJITDylib(), K, std::move(TSM)); - } - /// Emit should materialize the given IR. 
virtual void emit(MaterializationResponsibility R, VModuleKey K, ThreadSafeModule TSM) = 0; @@ -127,12 +121,6 @@ public: /// JITDylib. virtual Error add(JITDylib &JD, VModuleKey K, std::unique_ptr O); - /// Adds a MaterializationUnit representing the given object to the main - /// JITDylib. - Error add(VModuleKey K, std::unique_ptr O) { - return add(ES.getMainJITDylib(), K, std::move(O)); - } - /// Emit should materialize the given IR. virtual void emit(MaterializationResponsibility R, VModuleKey K, std::unique_ptr O) = 0; -- GitLab From 7e9c8da5fd4730d5beb7b8d3625f8a4a5383a54b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 14 Oct 2018 16:49:04 +0000 Subject: [PATCH 0170/1116] [ARM] Regenerate cttz tests Improve codegen view as part of PR32655 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344479 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/ARM/cttz_vector.ll | 419 +++++++++++++++++++++----------- 1 file changed, 283 insertions(+), 136 deletions(-) diff --git a/test/CodeGen/ARM/cttz_vector.ll b/test/CodeGen/ARM/cttz_vector.ll index bed64498041..f27c1e4b417 100644 --- a/test/CodeGen/ARM/cttz_vector.ll +++ b/test/CodeGen/ARM/cttz_vector.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s ; This test checks the @llvm.cttz.* intrinsics for vectors. 
@@ -23,7 +24,14 @@ declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) ;------------------------------------------------------------------------------ define void @test_v1i8(<1 x i8>* %p) { -; CHECK-LABEL: test_v1i8 +; CHECK-LABEL: test_v1i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: orr r1, r1, #256 +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i8>, <1 x i8>* %p %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false) store <1 x i8> %tmp, <1 x i8>* %p @@ -32,6 +40,21 @@ define void @test_v1i8(<1 x i8>* %p) { define void @test_v2i8(<2 x i8>* %p) { ; CHECK-LABEL: test_v2i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vorr.i32 d16, #0x100 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vmov.32 r1, d16[1] +; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: strb r1, [r0, #1] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i8>, <2 x i8>* %p %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false) store <2 x i8> %tmp, <2 x i8>* %p @@ -40,6 +63,19 @@ define void @test_v2i8(<2 x i8>* %p) { define void @test_v4i8(<4 x i8>* %p) { ; CHECK-LABEL: test_v4i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmov.i16 d19, #0x1 +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vorr.i16 d16, #0x100 +; CHECK-NEXT: vneg.s16 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vsub.i16 d16, d16, d19 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <4 x i8>, <4 x i8>* %p %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false) store <4 x i8> %tmp, <4 x i8>* %p @@ -48,13 +84,15 @@ define void 
@test_v4i8(<4 x i8>* %p) { define void @test_v8i8(<8 x i8>* %p) { ; CHECK-LABEL: test_v8i8: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i8 d18, #0x1 +; CHECK-NEXT: vneg.s8 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i8 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i8>, <8 x i8>* %p %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false) store <8 x i8> %tmp, <8 x i8>* %p @@ -63,13 +101,15 @@ define void @test_v8i8(<8 x i8>* %p) { define void @test_v16i8(<16 x i8>* %p) { ; CHECK-LABEL: test_v16i8: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i8 q10, #0x1 +; CHECK-NEXT: vneg.s8 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <16 x i8>, <16 x i8>* %p %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false) store <16 x i8> %tmp, <16 x i8>* %p @@ -78,6 +118,13 @@ define void @test_v16i8(<16 x i8>* %p) { define void @test_v1i16(<1 x i16>* %p) { ; CHECK-LABEL: test_v1i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r1, [r0] +; CHECK-NEXT: orr r1, r1, #65536 +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i16>, <1 x i16>* %p %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false) store <1 
x i16> %tmp, <1 x i16>* %p @@ -86,6 +133,18 @@ define void @test_v1i16(<1 x i16>* %p) { define void @test_v2i16(<2 x i16>* %p) { ; CHECK-LABEL: test_v2i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vorr.i32 d16, #0x10000 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <2 x i16>, <2 x i16>* %p %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false) store <2 x i16> %tmp, <2 x i16>* %p @@ -94,14 +153,16 @@ define void @test_v2i16(<2 x i16>* %p) { define void @test_v4i16(<4 x i16>* %p) { ; CHECK-LABEL: test_v4i16: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i16 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s16 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i16 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vpaddl.u8 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i16 d18, #0x1 +; CHECK-NEXT: vneg.s16 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i16 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i16>, <4 x i16>* %p %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false) store <4 x i16> %tmp, <4 x i16>* %p @@ -110,14 +171,16 @@ define void @test_v4i16(<4 x i16>* %p) { define void @test_v8i16(<8 x i16>* %p) { ; CHECK-LABEL: test_v8i16: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i16 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s16 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i16 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vpaddl.u8 [[Q1]], [[Q1]] -; CHECK: vst1.64 
{[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i16 q10, #0x1 +; CHECK-NEXT: vneg.s16 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i16 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i16>, <8 x i16>* %p %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) store <8 x i16> %tmp, <8 x i16>* %p @@ -126,6 +189,12 @@ define void @test_v8i16(<8 x i16>* %p) { define void @test_v1i32(<1 x i32>* %p) { ; CHECK-LABEL: test_v1i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i32>, <1 x i32>* %p %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false) store <1 x i32> %tmp, <1 x i32>* %p @@ -134,15 +203,17 @@ define void @test_v1i32(<1 x i32>* %p) { define void @test_v2i32(<2 x i32>* %p) { ; CHECK-LABEL: test_v2i32: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s32 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i32 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vpaddl.u8 [[D1]], [[D1]] -; CHECK: vpaddl.u16 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i32 d18, #0x1 +; CHECK-NEXT: vneg.s32 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i32 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i32>, <2 x i32>* %p %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) store <2 x i32> %tmp, <2 x i32>* %p @@ -151,15 +222,17 @@ define void @test_v2i32(<2 x i32>* %p) { define void @test_v4i32(<4 x i32>* %p) { ; CHECK-LABEL: test_v4i32: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, 
[r0] -; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s32 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i32 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vpaddl.u8 [[Q1]], [[Q1]] -; CHECK: vpaddl.u16 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i32 q10, #0x1 +; CHECK-NEXT: vneg.s32 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i32 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i32>, <4 x i32>* %p %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) store <4 x i32> %tmp, <4 x i32>* %p @@ -168,17 +241,19 @@ define void @test_v4i32(<4 x i32>* %p) { define void @test_v1i64(<1 x i64>* %p) { ; CHECK-LABEL: test_v1i64: -; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0 -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[D2]], [[D2]], [[D1]] -; CHECK: vand [[D2]], [[D1]], [[D2]] -; CHECK: vadd.i64 [[D2]], [[D2]], [[D3]] -; CHECK: vcnt.8 [[D2]], [[D2]] -; CHECK: vpaddl.u8 [[D2]], [[D2]] -; CHECK: vpaddl.u16 [[D2]], [[D2]] -; CHECK: vpaddl.u32 [[D2]], [[D2]] -; CHECK: vstr [[D2]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 d16, #0x0 +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vmov.i64 d18, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 d16, d16, d17 +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vadd.i64 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vpaddl.u32 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i64>, <1 x i64>* %p %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false) store <1 x i64> %tmp, <1 x i64>* %p @@ -187,17 +262,19 @@ define void @test_v1i64(<1 x i64>* %p) { define void 
@test_v2i64(<2 x i64>* %p) { ; CHECK-LABEL: test_v2i64: -; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0 -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q2]], [[Q1]], [[Q2]] -; CHECK: vadd.i64 [[Q2]], [[Q2]], [[Q3]] -; CHECK: vcnt.8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u16 [[Q2]], [[Q2]] -; CHECK: vpaddl.u32 [[Q2]], [[Q2]] -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vmov.i64 q10, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vadd.i64 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vpaddl.u32 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i64>, <2 x i64>* %p %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) store <2 x i64> %tmp, <2 x i64>* %p @@ -207,7 +284,13 @@ define void @test_v2i64(<2 x i64>* %p) { ;------------------------------------------------------------------------------ define void @test_v1i8_zero_undef(<1 x i8>* %p) { -; CHECK-LABEL: test_v1i8_zero_undef +; CHECK-LABEL: test_v1i8_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrb r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i8>, <1 x i8>* %p %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true) store <1 x i8> %tmp, <1 x i8>* %p @@ -216,6 +299,20 @@ define void @test_v1i8_zero_undef(<1 x i8>* %p) { define void @test_v2i8_zero_undef(<2 x i8>* %p) { ; CHECK-LABEL: test_v2i8_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.16 {d16[0]}, [r0:16] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; 
CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vmov.32 r1, d16[1] +; CHECK-NEXT: vmov.32 r2, d16[0] +; CHECK-NEXT: strb r1, [r0, #1] +; CHECK-NEXT: strb r2, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i8>, <2 x i8>* %p %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true) store <2 x i8> %tmp, <2 x i8>* %p @@ -224,6 +321,17 @@ define void @test_v2i8_zero_undef(<2 x i8>* %p) { define void @test_v4i8_zero_undef(<4 x i8>* %p) { ; CHECK-LABEL: test_v4i8_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vneg.s16 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i16 d17, #0xf +; CHECK-NEXT: vclz.i16 d16, d16 +; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vuzp.8 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <4 x i8>, <4 x i8>* %p %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true) store <4 x i8> %tmp, <4 x i8>* %p @@ -232,13 +340,15 @@ define void @test_v4i8_zero_undef(<4 x i8>* %p) { define void @test_v8i8_zero_undef(<8 x i8>* %p) { ; CHECK-LABEL: test_v8i8_zero_undef: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i8 [[D2:d[0-9]+]], #0x1 -; CHECK: vneg.s8 [[D3:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D3]] -; CHECK: vsub.i8 [[D1]], [[D1]], [[D2]] -; CHECK: vcnt.8 [[D1]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vmov.i8 d18, #0x1 +; CHECK-NEXT: vneg.s8 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vsub.i8 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i8>, <8 x i8>* %p %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true) store <8 x i8> %tmp, <8 x i8>* %p @@ -247,13 +357,15 @@ define void @test_v8i8_zero_undef(<8 x i8>* %p) { define void @test_v16i8_zero_undef(<16 x i8>* %p) { ; CHECK-LABEL: test_v16i8_zero_undef: -; CHECK: 
vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i8 [[Q2:q[0-9]+]], #0x1 -; CHECK: vneg.s8 [[Q3:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q3]] -; CHECK: vsub.i8 [[Q1]], [[Q1]], [[Q2]] -; CHECK: vcnt.8 [[Q1]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vmov.i8 q10, #0x1 +; CHECK-NEXT: vneg.s8 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <16 x i8>, <16 x i8>* %p %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) store <16 x i8> %tmp, <16 x i8>* %p @@ -262,6 +374,12 @@ define void @test_v16i8_zero_undef(<16 x i8>* %p) { define void @test_v1i16_zero_undef(<1 x i16>* %p) { ; CHECK-LABEL: test_v1i16_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldrh r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i16>, <1 x i16>* %p %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true) store <1 x i16> %tmp, <1 x i16>* %p @@ -270,6 +388,17 @@ define void @test_v1i16_zero_undef(<1 x i16>* %p) { define void @test_v2i16_zero_undef(<2 x i16>* %p) { ; CHECK-LABEL: test_v2i16_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vneg.s32 d18, d16 +; CHECK-NEXT: vand d16, d16, d18 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vst1.32 {d16[0]}, [r0:32] +; CHECK-NEXT: bx lr %a = load <2 x i16>, <2 x i16>* %p %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true) store <2 x i16> %tmp, <2 x i16>* %p @@ -278,13 +407,15 @@ define void @test_v2i16_zero_undef(<2 x i16>* %p) { define void @test_v4i16_zero_undef(<4 x i16>* %p) { ; CHECK-LABEL: test_v4i16_zero_undef: -; CHECK: vldr [[D1:d[0-9]+]], 
[r0] -; CHECK: vneg.s16 [[D2:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D2]] -; CHECK: vmov.i16 [[D3:d[0-9]+]], #0xf -; CHECK: vclz.i16 [[D1]], [[D1]] -; CHECK: vsub.i16 [[D1]], [[D3]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vneg.s16 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vmov.i16 d17, #0xf +; CHECK-NEXT: vclz.i16 d16, d16 +; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i16>, <4 x i16>* %p %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) store <4 x i16> %tmp, <4 x i16>* %p @@ -293,13 +424,15 @@ define void @test_v4i16_zero_undef(<4 x i16>* %p) { define void @test_v8i16_zero_undef(<8 x i16>* %p) { ; CHECK-LABEL: test_v8i16_zero_undef: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vneg.s16 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q2]] -; CHECK: vmov.i16 [[Q3:q[0-9]+]], #0xf -; CHECK: vclz.i16 [[Q1]], [[Q1]] -; CHECK: vsub.i16 [[Q1]], [[Q3]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vneg.s16 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vmov.i16 q9, #0xf +; CHECK-NEXT: vclz.i16 q8, q8 +; CHECK-NEXT: vsub.i16 q8, q9, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <8 x i16>, <8 x i16>* %p %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) store <8 x i16> %tmp, <8 x i16>* %p @@ -308,6 +441,12 @@ define void @test_v8i16_zero_undef(<8 x i16>* %p) { define void @test_v1i32_zero_undef(<1 x i32>* %p) { ; CHECK-LABEL: test_v1i32_zero_undef: +; CHECK: @ %bb.0: +; CHECK-NEXT: ldr r1, [r0] +; CHECK-NEXT: rbit r1, r1 +; CHECK-NEXT: clz r1, r1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i32>, <1 x i32>* %p %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true) store <1 x i32> %tmp, <1 x i32>* %p @@ -316,13 +455,15 @@ define void 
@test_v1i32_zero_undef(<1 x i32>* %p) { define void @test_v2i32_zero_undef(<2 x i32>* %p) { ; CHECK-LABEL: test_v2i32_zero_undef: -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vneg.s32 [[D2:d[0-9]+]], [[D1]] -; CHECK: vand [[D1]], [[D1]], [[D2]] -; CHECK: vmov.i32 [[D3:d[0-9]+]], #0x1f -; CHECK: vclz.i32 [[D1]], [[D1]] -; CHECK: vsub.i32 [[D1]], [[D3]], [[D1]] -; CHECK: vstr [[D1]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r0] +; CHECK-NEXT: vneg.s32 d17, d16 +; CHECK-NEXT: vand d16, d16, d17 +; CHECK-NEXT: vmov.i32 d17, #0x1f +; CHECK-NEXT: vclz.i32 d16, d16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i32>, <2 x i32>* %p %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) store <2 x i32> %tmp, <2 x i32>* %p @@ -331,13 +472,15 @@ define void @test_v2i32_zero_undef(<2 x i32>* %p) { define void @test_v4i32_zero_undef(<4 x i32>* %p) { ; CHECK-LABEL: test_v4i32_zero_undef: -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vneg.s32 [[Q2:q[0-9]+]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q1]], [[Q1]], [[Q2]] -; CHECK: vmov.i32 [[Q3:q[0-9]+]], #0x1f -; CHECK: vclz.i32 [[Q1]], [[Q1]] -; CHECK: vsub.i32 [[Q1]], [[Q3]], [[Q1]] -; CHECK: vst1.64 {[[D1]], [[D2]]}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vneg.s32 q9, q8 +; CHECK-NEXT: vand q8, q8, q9 +; CHECK-NEXT: vmov.i32 q9, #0x1f +; CHECK-NEXT: vclz.i32 q8, q8 +; CHECK-NEXT: vsub.i32 q8, q9, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <4 x i32>, <4 x i32>* %p %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) store <4 x i32> %tmp, <4 x i32>* %p @@ -346,17 +489,19 @@ define void @test_v4i32_zero_undef(<4 x i32>* %p) { define void @test_v1i64_zero_undef(<1 x i64>* %p) { ; CHECK-LABEL: test_v1i64_zero_undef: -; CHECK: vmov.i32 [[D2:d[0-9]+]], #0x0 -; CHECK: vldr [[D1:d[0-9]+]], [r0] -; CHECK: vmov.i64 [[D3:d[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 
[[D2]], [[D2]], [[D1]] -; CHECK: vand [[D2]], [[D1]], [[D2]] -; CHECK: vadd.i64 [[D2]], [[D2]], [[D3]] -; CHECK: vcnt.8 [[D2]], [[D2]] -; CHECK: vpaddl.u8 [[D2]], [[D2]] -; CHECK: vpaddl.u16 [[D2]], [[D2]] -; CHECK: vpaddl.u32 [[D2]], [[D2]] -; CHECK: vstr [[D2]], [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 d16, #0x0 +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vmov.i64 d18, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 d16, d16, d17 +; CHECK-NEXT: vand d16, d17, d16 +; CHECK-NEXT: vadd.i64 d16, d16, d18 +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vpaddl.u32 d16, d16 +; CHECK-NEXT: vstr d16, [r0] +; CHECK-NEXT: bx lr %a = load <1 x i64>, <1 x i64>* %p %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true) store <1 x i64> %tmp, <1 x i64>* %p @@ -365,17 +510,19 @@ define void @test_v1i64_zero_undef(<1 x i64>* %p) { define void @test_v2i64_zero_undef(<2 x i64>* %p) { ; CHECK-LABEL: test_v2i64_zero_undef: -; CHECK: vmov.i32 [[Q2:q[0-9]+]], #0x0 -; CHECK: vld1.64 {[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0] -; CHECK: vmov.i64 [[Q3:q[0-9]+]], #0xffffffffffffffff -; CHECK: vsub.i64 [[Q2]], [[Q2]], [[Q1:q[0-9]+]] -; CHECK: vand [[Q2]], [[Q1]], [[Q2]] -; CHECK: vadd.i64 [[Q2]], [[Q2]], [[Q3]] -; CHECK: vcnt.8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u8 [[Q2]], [[Q2]] -; CHECK: vpaddl.u16 [[Q2]], [[Q2]] -; CHECK: vpaddl.u32 [[Q2]], [[Q2]] -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q8, #0x0 +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vmov.i64 q10, #0xffffffffffffffff +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vand q8, q9, q8 +; CHECK-NEXT: vadd.i64 q8, q8, q10 +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vpaddl.u32 q8, q8 +; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: bx lr %a = load <2 x i64>, <2 x i64>* %p %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) 
store <2 x i64> %tmp, <2 x i64>* %p -- GitLab From d92ffe66987bc6686a36e13f6de086727525b8e0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 14 Oct 2018 17:34:20 +0000 Subject: [PATCH 0171/1116] [X86][AVX] Enable lowerVectorShuffleAsLanePermuteAndPermute v16i16/v32i8 shuffle lowering Extends D53148 from v4f64 now that we have test coverage for v16i16/v32i8 shuffles. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344481 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 10 ++++++++++ test/CodeGen/X86/vector-shuffle-256-v16.ll | 6 ++---- test/CodeGen/X86/vector-shuffle-256-v32.ll | 6 ++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 441f26dd4c6..4eaf1cc921b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -14692,6 +14692,11 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; + // Try to permute the lanes and then use a per-lane permute. + if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) + return V; + // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); } @@ -14772,6 +14777,11 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; + // Try to permute the lanes and then use a per-lane permute. + if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( + DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) + return V; + // Otherwise fall back on generic lowering. 
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); } diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 90970f15fea..2f0be026fd9 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4063,10 +4063,8 @@ define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_2 ; ; AVX2-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24: diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 5e9f30a727d..9dfbb6af075 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2505,16 +2505,14 @@ define <32 x i8> @shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_ ; ; AVX2-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,16,18,20,22,24,26,28,30,17,19,21,23,25,27,29,31] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: 
shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15] ; AVX512VLBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,16,18,20,22,24,26,28,30,17,19,21,23,25,27,29,31] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47: -- GitLab From 631cfd79b3240f5df03548a0a3eab519709743e9 Mon Sep 17 00:00:00 2001 From: Ayal Zaks Date: Sun, 14 Oct 2018 17:53:02 +0000 Subject: [PATCH 0172/1116] [LV] Fix comments reported when not vectorizing single iteration loops; NFC Landing this as a separate part of https://reviews.llvm.org/D50480, being a seemingly unrelated change ([LV] Vectorizing loops of arbitrary trip count without remainder under opt for size). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344483 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e93cfb34156..2ba2f00b4a5 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4558,8 +4558,15 @@ Optional LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // If we optimize the program for size, avoid creating the tail loop. 
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + if (TC == 1) { + ORE->emit(createMissedAnalysis("SingleIterationLoop") + << "loop trip count is one, irrelevant for vectorization"); + LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n"); + return None; + } + // If we don't know the precise trip count, don't try to vectorize. - if (TC < 2) { + if (TC == 0) { ORE->emit( createMissedAnalysis("UnknownLoopCountComplexCFG") << "unable to calculate the loop count due to complex control flow"); -- GitLab From c76c02e1ed1a158628019f61aea45fcf87712d2c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 14 Oct 2018 20:14:33 +0000 Subject: [PATCH 0173/1116] [InstCombine] Add PR27343 test cases git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344484 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/InstCombine/pr27343.ll | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 test/Transforms/InstCombine/pr27343.ll diff --git a/test/Transforms/InstCombine/pr27343.ll b/test/Transforms/InstCombine/pr27343.ll new file mode 100644 index 00000000000..5a9267b16af --- /dev/null +++ b/test/Transforms/InstCombine/pr27343.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -instcombine | FileCheck %s + +define i32 @__isnan(float %x) alwaysinline nounwind optsize { +; CHECK-LABEL: @__isnan( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[DOTCAST]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216 +; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %x.addr = alloca float, align 4 + store float %x, float* %x.addr, align 4 + %0 = load float, float* %x.addr, align 4 + %1 = bitcast float %0 to i32 + %shl = shl i32 %1, 1 + %cmp = icmp ugt i32 %shl, -16777216 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +define i1 
@icmp_shl7(i32 %x) { +; CHECK-LABEL: @icmp_shl7( +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shl = shl i32 %x, 7 + %cmp = icmp slt i32 %shl, 4608 + ret i1 %cmp +} -- GitLab From be51e5f9632255eda60f8f9e96e777c99f9415a1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 01:51:50 +0000 Subject: [PATCH 0174/1116] [X86] Autogenerate complete checks. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344485 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/splat-for-size.ll | 51 ++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll index 5a98a00338b..99ed8e8ccb6 100644 --- a/test/CodeGen/X86/splat-for-size.ll +++ b/test/CodeGen/X86/splat-for-size.ll @@ -19,7 +19,7 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 { define <4 x double> @splat_v4f64(<4 x double> %x) #1 { ; CHECK-LABEL: splat_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %add = fadd <4 x double> %x, @@ -29,7 +29,7 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 { define <4 x float> @splat_v4f32(<4 x float> %x) #0 { ; CHECK-LABEL: splat_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %add = fadd <4 x float> %x, @@ -39,7 +39,7 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 { define <8 x float> @splat_v8f32(<8 x float> %x) #1 { ; CHECK-LABEL: splat_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %add = fadd <8 x float> %x, @@ 
-57,7 +57,7 @@ define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 { ; ; AVX2-LABEL: splat_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %add = add <2 x i64> %x, @@ -78,7 +78,7 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { ; ; AVX2-LABEL: splat_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <4 x i64> %x, @@ -89,13 +89,13 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 { ; AVX-LABEL: splat_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX2-LABEL: splat_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %add = add <4 x i32> %x, @@ -107,7 +107,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { ; AVX-LABEL: splat_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -115,7 +115,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { ; ; AVX2-LABEL: splat_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <8 x i32> %x, @@ -131,7 +131,7 @@ define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 { ; ; AVX2-LABEL: splat_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastw 
{{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %add = add <8 x i16> %x, @@ -151,7 +151,7 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { ; ; AVX2-LABEL: splat_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <16 x i16> %x, @@ -167,7 +167,7 @@ define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 { ; ; AVX2-LABEL: splat_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*}}(%rip), %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %add = add <16 x i8> %x, @@ -187,7 +187,7 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { ; ; AVX2-LABEL: splat_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <32 x i8> %x, @@ -201,6 +201,31 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { @A = common global <3 x i64> zeroinitializer, align 32 define <8 x i64> @pr23259() #1 { +; AVX-LABEL: pr23259: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq $1 +; AVX-NEXT: .cfi_adjust_cfa_offset 8 +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_adjust_cfa_offset -8 +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX-NEXT: retq +; +; AVX2-LABEL: pr23259: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vmovdqa {{.*}}(%rip), %ymm0 +; AVX2-NEXT: pushq $1 +; AVX2-NEXT: .cfi_adjust_cfa_offset 8 +; AVX2-NEXT: popq %rax +; AVX2-NEXT: .cfi_adjust_cfa_offset -8 +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: 
vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32 %1 = shufflevector <4 x i64> %0, <4 x i64> undef, <3 x i32> -- GitLab From 5854a1f28315df26b0aefea9566e05b3ddbf520f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 01:51:53 +0000 Subject: [PATCH 0175/1116] [X86] Add 128 MOVDDUP to the constant pool printing in X86AsmPrinter::EmitInstruction. We use this instruction to broadcast a single 64-bit value to a v2i64/v2f64 vector. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344486 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86MCInstLower.cpp | 6 ++ test/CodeGen/X86/avg.ll | 6 +- .../X86/bitcast-int-to-vector-bool-sext.ll | 3 +- .../X86/bitcast-int-to-vector-bool-zext.ll | 3 +- .../CodeGen/X86/bitcast-int-to-vector-bool.ll | 3 +- .../X86/broadcast-elm-cross-splat-vec.ll | 72 ++++++++++++------- test/CodeGen/X86/splat-for-size.ll | 9 ++- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll | 3 +- 8 files changed, 72 insertions(+), 33 deletions(-) diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index acb2bc20858..76f0dd4837b 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -2133,6 +2133,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } break; + case X86::MOVDDUPrm: + case X86::VMOVDDUPrm: + case X86::VMOVDDUPZ128rm: case X86::VBROADCASTSSrm: case X86::VBROADCASTSSYrm: case X86::VBROADCASTSSZ128m: @@ -2169,6 +2172,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { int NumElts; switch (MI->getOpcode()) { default: llvm_unreachable("Invalid opcode"); + case X86::MOVDDUPrm: NumElts = 2; break; + case X86::VMOVDDUPrm: NumElts = 2; break; + case X86::VMOVDDUPZ128rm: NumElts = 2; break; case X86::VBROADCASTSSrm: NumElts = 
4; break; case X86::VBROADCASTSSYrm: NumElts = 8; break; case X86::VBROADCASTSSZ128m: NumElts = 4; break; diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index e8a03fe6a7b..84f1296d51c 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -1256,7 +1256,8 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1310,7 +1311,8 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpavgb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index 74c48e35bfe..c022d7908a1 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -158,7 +158,8 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq diff --git 
a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 6cd52c4d25c..75b5b701113 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -200,7 +200,8 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 1acc83485ce..3deac92d9ed 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -163,7 +163,8 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index 90f65597810..bb79efcbad4 100644 --- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -87,21 +87,24 @@ define <16 x i8> @f16xi8_i32(<16 x i8> %a) { define <16 x i8> @f16xi8_i64(<16 x i8> %a) { ; AVX-LABEL: f16xi8_i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = 
[7.9499288951273625E-275,7.9499288951273625E-275] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f16xi8_i64: ; ALL32: # %bb.0: -; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275] +; ALL32-NEXT: # xmm1 = mem[0,0] ; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f16xi8_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -202,7 +205,8 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) { ; AVX-LABEL: f32xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -219,7 +223,8 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) { ; AVX-64-LABEL: f32xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX-64-NEXT: # xmm2 = mem[0,0] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -424,7 +429,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = 
[7.9499288951273625E-275,7.9499288951273625E-275] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -456,7 +462,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-64-LABEL: f64xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275] +; AVX-64-NEXT: # xmm3 = mem[0,0] ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -675,21 +682,24 @@ define <8 x i16> @f8xi16_i32(<8 x i16> %a) { define <8 x i16> @f8xi16_i64(<8 x i16> %a) { ; AVX-LABEL: f8xi16_i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f8xi16_i64: ; ALL32: # %bb.0: -; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309] +; ALL32-NEXT: # xmm1 = mem[0,0] ; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f8xi16_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309] +; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -750,7 +760,8 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) { ; AVX-LABEL: f16xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = 
[4.1720559249406128E-309,4.1720559249406128E-309] +; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -767,7 +778,8 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) { ; AVX-64-LABEL: f16xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309] +; AVX-64-NEXT: # xmm2 = mem[0,0] ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -902,7 +914,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -934,7 +947,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-64-LABEL: f32xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [4.1720559249406128E-309,4.1720559249406128E-309] +; AVX-64-NEXT: # xmm3 = mem[0,0] ; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1120,21 +1134,24 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { define <4 x i32> @f4xi32_i64(<4 x i32> %a) { ; AVX-LABEL: f4xi32_i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; 
AVX-NEXT: retl ; ; ALL32-LABEL: f4xi32_i64: ; ALL32: # %bb.0: -; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] +; ALL32-NEXT: # xmm1 = mem[0,0] ; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f4xi32_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314] +; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -1155,7 +1172,8 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) { ; AVX-LABEL: f8xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314] +; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1172,7 +1190,8 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) { ; AVX-64-LABEL: f8xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314] +; AVX-64-NEXT: # xmm2 = mem[0,0] ; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1237,7 +1256,8 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-LABEL: f16xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314] +; AVX-NEXT: # xmm3 = mem[0,0] ; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, 
%xmm2, %ymm1, %ymm1 @@ -1269,7 +1289,8 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-64-LABEL: f16xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957909652723E-314,2.1219957909652723E-314] +; AVX-64-NEXT: # xmm3 = mem[0,0] ; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 @@ -1573,21 +1594,24 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) { define <4 x float> @f4xf32_f64(<4 x float> %a) { ; AVX-LABEL: f4xf32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retl ; ; ALL32-LABEL: f4xf32_f64: ; ALL32: # %bb.0: -; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492] +; ALL32-NEXT: # xmm1 = mem[0,0] ; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; ALL32-NEXT: retl ; ; AVX-64-LABEL: f4xf32_f64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [0.0078125018626451492,0.0078125018626451492] +; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-64-NEXT: retq diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll index 99ed8e8ccb6..7567dbcdad0 100644 --- a/test/CodeGen/X86/splat-for-size.ll +++ b/test/CodeGen/X86/splat-for-size.ll @@ -9,7 +9,8 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 { ; CHECK-LABEL: splat_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [1,1] +; CHECK-NEXT: # xmm1 = mem[0,0] 
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %add = fadd <2 x double> %x, @@ -51,7 +52,8 @@ define <8 x float> @splat_v8f32(<8 x float> %x) #1 { define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 { ; AVX-LABEL: splat_v2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; @@ -70,7 +72,8 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { ; AVX-LABEL: splat_v4i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2,2] +; AVX-NEXT: # xmm2 = mem[0,0] ; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 27541c44b9d..82385386c88 100644 --- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -629,7 +629,8 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone { ; ; CHECK-AVX1-LABEL: test_urem_both: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; CHECK-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61] +; CHECK-AVX1-NEXT: # xmm1 = mem[0,0] ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -- GitLab From a0b9470673cf59fa27e8415a962151380fcc4981 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 01:51:58 +0000 Subject: [PATCH 0176/1116] [X86] Move promotion of vector and/or/xor from legalization to DAG combine Summary: I've noticed that the bitcasts we introduce for these make computeKnownBits and computeNumSignBits not work well in LegalizeVectorOps. 
LegalizeVectorOps legalizes bottom-up while LegalizeDAG legalizes top-down. The bottom-up strategy for LegalizeVectorOps means operands are legalized before their uses. So we promote and/or/xor before we legalize the nodes that use them, making computeKnownBits/computeNumSignBits in places like LowerTruncate suboptimal. I looked at changing LegalizeVectorOps to be top-down as well, but that was more disruptive and caused some regressions. I also looked at just moving promotion of binops to LegalizeDAG, but that had a few issues. One was around matching AND, ANDN, OR into VSELECT: I had to create ANDN as vXi64, but the other nodes hadn't been legalized yet. I didn't look too hard at fixing that. This patch seems to produce better results overall than my other attempts. We now form broadcasts of constants better in some cases. For at least some of them, the AND was being introduced in LegalizeDAG and promoted to vXi64, and the BUILD_VECTOR was also legalized there. I think we got a bad ordering of those steps. Now that the promotion is out of the legalizer, we handle this better. In the longer term I think we really should evaluate whether we should be doing this promotion at all. It's really there to reduce the isel pattern count, but I'm wondering if we'd be better served just eating the pattern cost or doing C++-based isel for vector and/or/xor in X86ISelDAGToDAG. The masked and/or/xor will definitely be difficult in patterns if a bitcast gets between the vselect and the and/or/xor node. That becomes a lot of permutations to cover. 
Reviewers: RKSimon, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D53107 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344487 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 51 ++-- test/CodeGen/X86/avx-logic.ll | 12 +- test/CodeGen/X86/avx512-ext.ll | 4 +- test/CodeGen/X86/avx512-insert-extract.ll | 3 - test/CodeGen/X86/avx512-schedule.ll | 8 +- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll | 88 ------- test/CodeGen/X86/cast-vsel.ll | 15 +- test/CodeGen/X86/combine-sdiv.ll | 13 +- test/CodeGen/X86/combine-srl.ll | 65 +++-- test/CodeGen/X86/gather-addresses.ll | 32 +-- test/CodeGen/X86/horizontal-reduce-umax.ll | 110 ++++----- test/CodeGen/X86/horizontal-reduce-umin.ll | 110 ++++----- test/CodeGen/X86/known-bits.ll | 4 +- test/CodeGen/X86/nontemporal-loads.ll | 36 +-- test/CodeGen/X86/paddus.ll | 158 ++++++------ test/CodeGen/X86/psubus.ll | 2 +- test/CodeGen/X86/sat-add.ll | 61 ++--- test/CodeGen/X86/setcc-lowering.ll | 12 +- test/CodeGen/X86/sse2-intrinsics-canonical.ll | 12 +- ...-masked-merge-vector-variablemask-const.ll | 8 +- test/CodeGen/X86/v8i1-masks.ll | 10 +- test/CodeGen/X86/vector-blend.ll | 2 +- test/CodeGen/X86/vector-reduce-umax.ll | 102 ++++---- test/CodeGen/X86/vector-reduce-umin.ll | 102 ++++---- test/CodeGen/X86/vector-shift-lshr-128.ll | 12 +- test/CodeGen/X86/vector-shift-shl-128.ll | 8 +- test/CodeGen/X86/vector-shuffle-256-v16.ll | 4 +- test/CodeGen/X86/vector-trunc-math.ll | 228 +++++++++--------- test/CodeGen/X86/vector-trunc-packus.ll | 30 +-- test/CodeGen/X86/vector-trunc-ssat.ll | 3 +- test/CodeGen/X86/vector-trunc-usat.ll | 3 +- test/CodeGen/X86/vector-trunc.ll | 21 +- test/CodeGen/X86/vshift-6.ll | 4 +- test/CodeGen/X86/x86-interleaved-access.ll | 6 +- 34 files changed, 592 insertions(+), 747 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4eaf1cc921b..be6f9ed2188 100644 --- 
a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -871,9 +871,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { - setOperationPromotedToType(ISD::AND, VT, MVT::v2i64); - setOperationPromotedToType(ISD::OR, VT, MVT::v2i64); - setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64); setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64); } @@ -1183,9 +1180,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { - setOperationPromotedToType(ISD::AND, VT, MVT::v4i64); - setOperationPromotedToType(ISD::OR, VT, MVT::v4i64); - setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64); setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64); } @@ -1384,13 +1378,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setCondCodeAction(ISD::SETLE, VT, Custom); } - // Need to promote to 64-bit even though we have 32-bit masked instructions - // because the IR optimizers rearrange bitcasts around logic ops leaving - // too many variations to handle if we don't promote them. 
- setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64); - setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64); - setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); - if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); @@ -1593,10 +1580,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); - setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); - setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); - setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); - // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); @@ -35226,6 +35209,10 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, !SplatVal.isMask()) return SDValue(); + // Don't prevent creation of ANDN. + if (isBitwiseNot(Op0)) + return SDValue(); + if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) return SDValue(); @@ -35426,6 +35413,27 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } +// This promotes vectors and/or/xor to a vXi64 type. We used to do this during +// op legalization, but DAG combine yields better results. +// TODO: This is largely just to reduce the number of isel patterns. Maybe we +// can just add all the patterns or do C++ based selection in X86ISelDAGToDAG? +static SDValue promoteVecLogicOp(SDNode *N, SelectionDAG &DAG) { + MVT VT = N->getSimpleValueType(0); + + if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) + return SDValue(); + + // Already correct type. 
+ if (VT.getVectorElementType() == MVT::i64) + return SDValue(); + + MVT NewVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + SDValue Op0 = DAG.getBitcast(NewVT, N->getOperand(0)); + SDValue Op1 = DAG.getBitcast(NewVT, N->getOperand(1)); + return DAG.getBitcast(VT, DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, + Op0, Op1)); +} + static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -35460,6 +35468,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue V = promoteVecLogicOp(N, DAG)) + return V; + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -35782,6 +35793,9 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue V = promoteVecLogicOp(N, DAG)) + return V; + if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; @@ -37810,6 +37824,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue V = promoteVecLogicOp(N, DAG)) + return V; + if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll index 0fe5cbacc84..f22c6257e45 100644 --- a/test/CodeGen/X86/avx-logic.ll +++ b/test/CodeGen/X86/avx-logic.ll @@ -314,7 +314,7 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -342,7 +342,7 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; 
AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 @@ -450,7 +450,7 @@ define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -479,7 +479,7 @@ define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -537,7 +537,7 @@ define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [281470681808895,281470681808895] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 @@ -566,7 +566,7 @@ define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: 
vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [281470681808895,281470681808895] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index c23a474a97f..d56cf0fe09e 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -2157,7 +2157,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { ; ALL-LABEL: zext_4xi1_to_4x32: ; ALL: # %bb.0: -; ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] ; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; ALL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; ALL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -2171,7 +2171,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { ; ALL-LABEL: zext_2xi1_to_2xi64: ; ALL: # %bb.0: -; ALL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; ALL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] ; ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 ; ALL-NEXT: vpand %xmm2, %xmm0, %xmm0 ; ALL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index db3716c9530..e29d62b2605 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -993,7 +993,6 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) { ; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1023,7 +1022,6 
@@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1059,7 +1057,6 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) { ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 71dabf70a18..e0237ff0d83 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -4711,7 +4711,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { ; GENERIC-LABEL: zext_4xi1_to_4x32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] +; GENERIC-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] sched: [7:0.50] ; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] @@ -4720,7 +4720,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { ; ; SKX-LABEL: zext_4xi1_to_4x32: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] +; SKX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] sched: [6:0.50] ; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] @@ -4734,7 +4734,7 @@ 
define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { ; GENERIC-LABEL: zext_2xi1_to_2xi64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] +; GENERIC-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] sched: [7:0.50] ; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] @@ -4743,7 +4743,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { ; ; SKX-LABEL: zext_2xi1_to_2xi64: ; SKX: # %bb.0: -; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] +; SKX-NEXT: vpbroadcastq {{.*#+}} xmm2 = [255,255] sched: [6:0.50] ; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] ; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index 8c3fe900336..79de4aec42b 100644 --- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -9780,7 +9780,6 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -9807,7 +9806,6 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ 
-9835,7 +9833,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -9866,7 +9863,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -9897,7 +9893,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -9925,7 +9920,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -9954,7 +9948,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -9985,7 +9978,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpgtb %xmm0, 
%xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10017,12 +10009,10 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -10052,12 +10042,10 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -10088,14 +10076,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl 
%edi, %ecx @@ -10130,14 +10116,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -10172,7 +10156,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax @@ -10201,7 +10184,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax @@ -10231,7 +10213,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -10264,7 +10245,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, 
%zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -10296,7 +10276,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -10323,7 +10302,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -10351,7 +10329,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -10382,7 +10359,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -10413,7 +10389,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: 
vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -10441,7 +10416,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -10470,7 +10444,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -10502,7 +10475,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -10535,7 +10507,6 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -10563,7 +10534,6 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -10592,7 +10562,6 @@ 
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10624,7 +10593,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10656,7 +10624,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -10685,7 +10652,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -10715,7 +10681,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10747,7 +10712,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, 
%ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10782,12 +10746,10 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -10820,12 +10782,10 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -10856,7 +10816,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm2 ; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10866,7 +10825,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; 
NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -10901,7 +10859,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm1 ; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -10911,7 +10868,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -14768,7 +14724,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -14795,7 +14750,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -14824,7 +14778,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl 
%edi, %eax @@ -14855,7 +14808,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -14887,7 +14839,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_ ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -14915,7 +14866,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -14945,7 +14895,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -14976,7 +14925,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -15009,12 +14957,10 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ 
; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -15044,12 +14990,10 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -15081,14 +15025,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -15123,14 +15065,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw 
%k0, %eax ; NoVLX-NEXT: andl %edi, %eax ; NoVLX-NEXT: shrl $16, %edi ; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -15166,7 +15106,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax @@ -15195,7 +15134,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax @@ -15226,7 +15164,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -15259,7 +15196,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -15292,7 +15228,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 
; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -15319,7 +15254,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -15348,7 +15282,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -15379,7 +15312,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -15411,7 +15343,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__ ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -15439,7 +15370,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64> ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, 
%zmm0, %zmm0 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -15469,7 +15399,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -15501,7 +15430,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, < ; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0 -; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0 ; NoVLX-NEXT: kmovw %edi, %k1 ; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; NoVLX-NEXT: kmovw %k0, %eax @@ -15535,7 +15463,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -15563,7 +15490,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: vzeroupper @@ -15593,7 +15519,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ 
-15625,7 +15550,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -15658,7 +15582,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_ ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -15687,7 +15610,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: movzwl %ax, %eax @@ -15718,7 +15640,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -15750,7 +15671,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -15785,14 +15705,12 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: 
vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vpmaxuw %ymm3, %ymm2, %ymm0 ; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -15823,14 +15741,12 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm1, %ymm0 ; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: shll $16, %eax @@ -15862,7 +15778,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 ; NoVLX-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2 ; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2 -; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2 ; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -15873,7 +15788,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx @@ -15908,7 +15822,6 @@ define zeroext i64 
@test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1 ; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 ; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1 -; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1 ; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0 ; NoVLX-NEXT: kmovw %k0, %eax ; NoVLX-NEXT: andl %edi, %eax @@ -15918,7 +15831,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, ; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0 -; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 ; NoVLX-NEXT: kmovw %k0, %ecx ; NoVLX-NEXT: andl %edi, %ecx diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll index ff41083835f..b1e4243d01a 100644 --- a/test/CodeGen/X86/cast-vsel.ll +++ b/test/CodeGen/X86/cast-vsel.ll @@ -357,17 +357,16 @@ define void @example25() nounwind { ; AVX2-LABEL: example25: ; AVX2: # %bb.0: # %vector.ph ; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB5_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vmovups da+4096(%rax), %ymm1 -; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1 -; AVX2-NEXT: vmovups dc+4096(%rax), %ymm2 -; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2 -; AVX2-NEXT: vandps %ymm0, %ymm2, %ymm2 -; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovups %ymm1, dj+4096(%rax) +; AVX2-NEXT: vmovups da+4096(%rax), %ymm0 +; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm0, %ymm0 +; AVX2-NEXT: vmovups dc+4096(%rax), %ymm1 +; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm1, %ymm1 +; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, dj+4096(%rax) ; AVX2-NEXT: addq $32, %rax ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %for.end diff --git a/test/CodeGen/X86/combine-sdiv.ll 
b/test/CodeGen/X86/combine-sdiv.ll index 72d458c8513..26a3cd47645 100644 --- a/test/CodeGen/X86/combine-sdiv.ll +++ b/test/CodeGen/X86/combine-sdiv.ll @@ -726,7 +726,8 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: # ymm2 = mem[0,1,0,1] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -777,7 +778,9 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) { ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOP-NEXT: vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0 +; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; XOP-NEXT: # ymm2 = mem[0,1,0,1] +; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 ; XOP-NEXT: retq %1 = sdiv <16 x i16> %x, ret <16 x i16> %1 @@ -960,7 +963,8 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: # ymm5 = mem[0,1,0,1] ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm0, %ymm5, 
%ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 @@ -1055,7 +1059,8 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; XOP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] +; XOP-NEXT: # ymm5 = mem[0,1,0,1] ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6 diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 9bd0be073f6..e0692166171 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -357,55 +357,50 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshufb %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $4, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: pshufb %xmm1, %xmm3 ; SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: paddb %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pcmpeqb %xmm2, %xmm3 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pshufb %xmm5, %xmm4 +; SSE-NEXT: pand %xmm1, 
%xmm4 +; SSE-NEXT: paddb %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pcmpeqb %xmm2, %xmm1 ; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: psrlw $8, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 ; SSE-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: psrld $5, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: psrld $5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: ; AVX: # %bb.0: ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 -; AVX-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm4 +; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX-NEXT: vpand %xmm3, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX-NEXT: 
vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index ca8fd2acfa3..6468523b3c4 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -149,11 +149,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind ; LIN-SSE2-NEXT: andl %ecx, %edx ; LIN-SSE2-NEXT: andl %ecx, %esi ; LIN-SSE2-NEXT: andl %ecx, %edi -; LIN-SSE2-NEXT: movd %eax, %xmm0 -; LIN-SSE2-NEXT: movd %edx, %xmm1 +; LIN-SSE2-NEXT: movq %rax, %xmm0 +; LIN-SSE2-NEXT: movq %rdx, %xmm1 ; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; LIN-SSE2-NEXT: movd %edi, %xmm2 -; LIN-SSE2-NEXT: movd %esi, %xmm1 +; LIN-SSE2-NEXT: movq %rdi, %xmm2 +; LIN-SSE2-NEXT: movq %rsi, %xmm1 ; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN-SSE2-NEXT: retq ; @@ -169,11 +169,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind ; LIN-SSE4-NEXT: andl %ecx, %edx ; LIN-SSE4-NEXT: andl %ecx, %esi ; LIN-SSE4-NEXT: andl %ecx, %edi -; LIN-SSE4-NEXT: movd %edx, %xmm1 -; LIN-SSE4-NEXT: movd %eax, %xmm0 +; LIN-SSE4-NEXT: movq %rdx, %xmm1 +; LIN-SSE4-NEXT: movq %rax, %xmm0 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; LIN-SSE4-NEXT: movd %edi, %xmm2 -; LIN-SSE4-NEXT: movd %esi, %xmm1 +; LIN-SSE4-NEXT: movq %rdi, %xmm2 +; LIN-SSE4-NEXT: movq %rsi, %xmm1 ; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; LIN-SSE4-NEXT: retq ; @@ -192,11 +192,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind ; WIN-SSE2-NEXT: andl %r9d, %ecx ; WIN-SSE2-NEXT: andl %r9d, %r8d ; WIN-SSE2-NEXT: andl %r9d, %edx -; WIN-SSE2-NEXT: movd %eax, %xmm0 -; WIN-SSE2-NEXT: movd %ecx, %xmm1 +; WIN-SSE2-NEXT: movq %rax, %xmm0 +; WIN-SSE2-NEXT: movq %rcx, %xmm1 ; 
WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIN-SSE2-NEXT: movd %edx, %xmm2 -; WIN-SSE2-NEXT: movd %r8d, %xmm1 +; WIN-SSE2-NEXT: movq %rdx, %xmm2 +; WIN-SSE2-NEXT: movq %r8, %xmm1 ; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE2-NEXT: retq ; @@ -212,11 +212,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind ; WIN-SSE4-NEXT: andl %r9d, %ecx ; WIN-SSE4-NEXT: andl %r9d, %r8d ; WIN-SSE4-NEXT: andl %r9d, %edx -; WIN-SSE4-NEXT: movd %ecx, %xmm1 -; WIN-SSE4-NEXT: movd %eax, %xmm0 +; WIN-SSE4-NEXT: movq %rcx, %xmm1 +; WIN-SSE4-NEXT: movq %rax, %xmm0 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; WIN-SSE4-NEXT: movd %edx, %xmm2 -; WIN-SSE4-NEXT: movd %r8d, %xmm1 +; WIN-SSE4-NEXT: movq %rdx, %xmm2 +; WIN-SSE4-NEXT: movq %r8, %xmm1 ; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; WIN-SSE4-NEXT: retq ; diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll index a4888e1cd3f..88f6b01131a 100644 --- a/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -230,15 +230,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -273,15 +272,14 @@ define i16 @test_reduce_v8i16(<8 x i16> 
%a0) { ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -832,20 +830,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -896,20 +893,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; 
X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1670,35 +1666,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, 
%xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1748,35 +1739,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-LABEL: test_reduce_v32i16: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X64-SSE2-NEXT: pxor %xmm4, %xmm2 -; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm4, %xmm3 ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 -; X64-SSE2-NEXT: pxor %xmm4, %xmm2 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pxor %xmm0, %xmm2 -; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm2 -; X64-SSE2-NEXT: pxor %xmm4, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 -; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; 
X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll index 3ce01cfdf4d..482d0826037 100644 --- a/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -232,15 +232,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -269,15 +268,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw 
%xmm0, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -772,20 +770,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 -; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pxor %xmm2, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: pxor %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -827,20 +824,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 -; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pxor %xmm2, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: pxor %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1574,35 +1570,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pminsw %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pminsw %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pxor %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm0, %xmm2 -; X86-SSE2-NEXT: pminsw %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1643,35 +1634,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-LABEL: test_reduce_v32i16: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; X64-SSE2-NEXT: pxor %xmm4, %xmm2 -; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: pminsw %xmm2, %xmm0 ; X64-SSE2-NEXT: pxor %xmm4, %xmm3 ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pminsw %xmm3, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm4, %xmm2 -; X64-SSE2-NEXT: pxor %xmm4, %xmm2 -; X64-SSE2-NEXT: pxor %xmm2, %xmm1 -; X64-SSE2-NEXT: pxor %xmm0, %xmm2 -; X64-SSE2-NEXT: pminsw %xmm1, %xmm2 -; X64-SSE2-NEXT: pxor %xmm4, %xmm2 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; X64-SSE2-NEXT: pxor %xmm4, %xmm2 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm2, %xmm0 -; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-SSE2-NEXT: pxor %xmm4, %xmm0 +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 -; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; 
X64-SSE2-NEXT: pminsw %xmm0, %xmm1 ; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: pxor %xmm4, %xmm1 ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; X64-SSE2-NEXT: pxor %xmm4, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: pxor %xmm4, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/known-bits.ll b/test/CodeGen/X86/known-bits.ll index 5d574391c50..5066e4777cc 100644 --- a/test/CodeGen/X86/known-bits.ll +++ b/test/CodeGen/X86/known-bits.ll @@ -19,7 +19,7 @@ define void @knownbits_zext_in_reg(i8*) nounwind { ; X32-NEXT: movzbl %al, %eax ; X32-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1 -; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X32-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 @@ -69,7 +69,7 @@ define void @knownbits_zext_in_reg(i8*) nounwind { ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; X64-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm1 -; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 ; X64-NEXT: movzbl %al, %eax ; X64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll index 56428979568..efc08ca1718 100644 --- 
a/test/CodeGen/X86/nontemporal-loads.ll +++ b/test/CodeGen/X86/nontemporal-loads.ll @@ -1800,35 +1800,23 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) { define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; SSE2-LABEL: test_masked_v16i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: pcmpeqd %xmm12, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm7 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: pandn (%rdi), %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pandn 16(%rdi), %xmm5 -; SSE2-NEXT: pandn %xmm1, %xmm11 -; SSE2-NEXT: por %xmm5, %xmm11 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: pand %xmm6, %xmm2 ; SSE2-NEXT: pandn 32(%rdi), %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm9 -; SSE2-NEXT: por %xmm6, %xmm9 +; SSE2-NEXT: por %xmm6, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm3 ; SSE2-NEXT: pandn 48(%rdi), %xmm7 -; SSE2-NEXT: pandn %xmm3, %xmm8 -; SSE2-NEXT: por %xmm7, %xmm8 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: por %xmm7, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_masked_v16i32: diff --git a/test/CodeGen/X86/paddus.ll b/test/CodeGen/X86/paddus.ll index 75b0597d389..63ef093fdd9 100644 --- a/test/CodeGen/X86/paddus.ll +++ b/test/CodeGen/X86/paddus.ll @@ -801,22 +801,20 @@ define <8 x i16> @test23(<8 x i16> %x) { ; SSE2-LABEL: test23: ; SSE2: # 
%bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test23: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm2, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtw %xmm2, %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm1 +; SSSE3-NEXT: por %xmm2, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; @@ -1029,37 +1027,33 @@ define <16 x i16> @test28(<16 x i16> %x) { define <16 x i16> @test29(<16 x i16> %x) { ; SSE2-LABEL: test29: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE2-NEXT: por %xmm4, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test29: ; SSSE3: 
# %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtw %xmm4, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: movdqa %xmm0, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm1 +; SSSE3-NEXT: por %xmm4, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-NEXT: retq ; @@ -1343,66 +1337,58 @@ define <32 x i16> @test34(<32 x i16> %x) { define <32 x i16> @test35(<32 x i16> %x) { ; SSE2-LABEL: test35: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: pcmpgtw %xmm5, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm6, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: pxor %xmm5, %xmm6 ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtw %xmm7, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: pcmpgtw 
%xmm4, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: pxor %xmm5, %xmm8 +; SSE2-NEXT: pxor %xmm3, %xmm5 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pcmpgtw %xmm3, %xmm4 ; SSE2-NEXT: movdqa %xmm8, %xmm3 +; SSE2-NEXT: pcmpgtw %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: pcmpgtw %xmm0, %xmm1 +; SSE2-NEXT: por %xmm6, %xmm1 +; SSE2-NEXT: por %xmm7, %xmm2 +; SSE2-NEXT: por %xmm8, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: test35: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: pcmpgtw %xmm5, %xmm8 -; SSSE3-NEXT: movdqa %xmm2, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pcmpgtw %xmm6, %xmm5 +; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSSE3-NEXT: movdqa %xmm0, %xmm6 +; SSSE3-NEXT: pxor %xmm5, %xmm6 ; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm4, %xmm7 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtw %xmm7, %xmm6 -; SSSE3-NEXT: pxor %xmm0, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm7 -; SSSE3-NEXT: pcmpgtw %xmm4, %xmm7 -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm8 -; 
SSSE3-NEXT: movdqa %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 +; SSSE3-NEXT: pxor %xmm5, %xmm7 +; SSSE3-NEXT: movdqa %xmm2, %xmm8 +; SSSE3-NEXT: pxor %xmm5, %xmm8 +; SSSE3-NEXT: pxor %xmm3, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: pcmpgtw %xmm3, %xmm4 ; SSSE3-NEXT: movdqa %xmm8, %xmm3 +; SSSE3-NEXT: pcmpgtw %xmm2, %xmm3 +; SSSE3-NEXT: movdqa %xmm7, %xmm2 +; SSSE3-NEXT: pcmpgtw %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm6, %xmm1 +; SSSE3-NEXT: pcmpgtw %xmm0, %xmm1 +; SSSE3-NEXT: por %xmm6, %xmm1 +; SSSE3-NEXT: por %xmm7, %xmm2 +; SSSE3-NEXT: por %xmm8, %xmm3 +; SSSE3-NEXT: por %xmm5, %xmm4 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: movdqa %xmm4, %xmm3 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: test35: diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll index e2089f6b0d2..a6bdfe9780c 100644 --- a/test/CodeGen/X86/psubus.ll +++ b/test/CodeGen/X86/psubus.ll @@ -792,7 +792,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind { ; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1 ; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2 ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 diff --git a/test/CodeGen/X86/sat-add.ll b/test/CodeGen/X86/sat-add.ll index 3cb11b11ec3..ec160c94f5e 100644 --- a/test/CodeGen/X86/sat-add.ll +++ b/test/CodeGen/X86/sat-add.ll @@ -679,13 +679,12 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16 define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> %y) { ; SSE2-LABEL: unsigned_sat_variable_v8i16_using_min: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; 
SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] +; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: pminsw %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -717,15 +716,12 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8 x i16> %y) { ; SSE2-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: paddw %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtw %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddw %xmm1, %xmm2 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtw %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval: @@ -750,17 +746,15 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32> ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_min: ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm4, 
%xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647] +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: pandn %xmm2, %xmm4 +; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -809,15 +803,12 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4 x i32> %y) { ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval: ; SSE2: # %bb.0: -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval: diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index ce057b28cc9..100461d22c9 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -45,19 +45,17 @@ define void @pr26232(i64 %a, <16 x i1> %b) { ; AVX-LABEL: pr26232: ; AVX: # %bb.0: # %allocas ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_1: # %for_loop599 ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX-NEXT: xorl 
%eax, %eax ; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000 ; AVX-NEXT: setl %al -; AVX-NEXT: vmovd %eax, %xmm3 -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX-NEXT: vpand %xmm0, %xmm3, %xmm3 -; AVX-NEXT: vpsllw $7, %xmm3, %xmm3 -; AVX-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpmovmskb %xmm3, %eax +; AVX-NEXT: vmovd %eax, %xmm2 +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2 +; AVX-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX-NEXT: vpmovmskb %xmm2, %eax ; AVX-NEXT: testw %ax, %ax ; AVX-NEXT: jne .LBB1_1 ; AVX-NEXT: # %bb.2: # %for_exit600 diff --git a/test/CodeGen/X86/sse2-intrinsics-canonical.ll b/test/CodeGen/X86/sse2-intrinsics-canonical.ll index 04cd7ec47a1..506fb9eb100 100644 --- a/test/CodeGen/X86/sse2-intrinsics-canonical.ll +++ b/test/CodeGen/X86/sse2-intrinsics-canonical.ll @@ -198,9 +198,9 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) { ; ; AVX2-LABEL: test_x86_sse2_psubus_b_64: ; AVX2: ## %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] -; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A] +; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4 ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda] ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2] ; AVX2-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3] @@ -209,9 +209,9 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) { ; ; SKX-LABEL: test_x86_sse2_psubus_b_64: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SKX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A] -; SKX-NEXT: ## fixup A - offset: 4, value: 
LCPI6_0, kind: FK_Data_4 +; SKX-NEXT: vpbroadcastw LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,255,255,255,255,255,255,255] +; SKX-NEXT: ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A] +; SKX-NEXT: ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4 ; SKX-NEXT: vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda] ; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2] ; SKX-NEXT: vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3] diff --git a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll index 7cb0d3ff58f..f109d69621c 100644 --- a/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -132,9 +132,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, ; ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0 ; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2 +; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm2 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2 ; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 @@ -142,9 +142,9 @@ define <4 x i32> @in_constant_varx_mone_invmask(<4 x i32> *%px, <4 x i32> *%py, ; ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0 ; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2 +; CHECK-XOP-NEXT: vpxor (%rdi), %xmm1, %xmm2 ; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0 ; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll index a799b0e6f12..7f9ae2e8518 
100644 --- a/test/CodeGen/X86/v8i1-masks.ll +++ b/test/CodeGen/X86/v8i1-masks.ll @@ -44,10 +44,9 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi ; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vmovups (%eax), %ymm2 ; X32-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] -; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X32-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X32-AVX2-NEXT: vmovaps %ymm0, (%eax) +; X32-AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 +; X32-AVX2-NEXT: vmovdqa %ymm0, (%eax) ; X32-AVX2-NEXT: vzeroupper ; X32-AVX2-NEXT: retl ; @@ -58,10 +57,9 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi ; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vmovups (%rdx), %ymm2 ; X64-AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX2-NEXT: vpsrld $31, %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index 934d1027e9b..5008a1e865d 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -629,7 +629,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { ; ; AVX1-LABEL: constant_pblendvb_avx2: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, 
%ymm1, %ymm0 diff --git a/test/CodeGen/X86/vector-reduce-umax.ll b/test/CodeGen/X86/vector-reduce-umax.ll index 680a5c52e63..52b42ce9bcb 100644 --- a/test/CodeGen/X86/vector-reduce-umax.ll +++ b/test/CodeGen/X86/vector-reduce-umax.ll @@ -1141,15 +1141,14 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1207,20 +1206,19 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1296,35 +1294,30 @@ define i16 
@test_v32i16(<32 x i16> %a0) { ; SSE2-LABEL: test_v32i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pmaxsw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1406,47 +1399,38 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; 
SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pmaxsw %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pmaxsw %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm2 ; SSE2-NEXT: pmaxsw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pmaxsw %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pmaxsw %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pmaxsw %xmm5, %xmm1 ; SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; diff --git 
a/test/CodeGen/X86/vector-reduce-umin.ll b/test/CodeGen/X86/vector-reduce-umin.ll index 52adee5ab26..32a1cdf0f17 100644 --- a/test/CodeGen/X86/vector-reduce-umin.ll +++ b/test/CodeGen/X86/vector-reduce-umin.ll @@ -1140,15 +1140,14 @@ define i16 @test_v8i16(<8 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1187,20 +1186,19 @@ define i16 @test_v16i16(<16 x i16> %a0) { ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pxor %xmm2, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm1 ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 @@ -1253,35 +1251,30 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; 
SSE2-LABEL: test_v32i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pminsw %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm4, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pminsw %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pminsw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: pxor %xmm4, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; @@ -1338,47 +1331,38 @@ define i16 @test_v64i16(<64 x i16> %a0) { ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pminsw %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; 
SSE2-NEXT: pminsw %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pminsw %xmm4, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm6 ; SSE2-NEXT: pxor %xmm8, %xmm2 ; SSE2-NEXT: pminsw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm8, %xmm4 ; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pminsw %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pminsw %xmm7, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pminsw %xmm5, %xmm1 ; SSE2-NEXT: pminsw %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll 
b/test/CodeGen/X86/vector-shift-lshr-128.ll index bd77311479b..7ce33dcfe24 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -373,8 +373,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -382,16 +382,16 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -488,8 +488,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 ; X32-SSE-NEXT: pandn %xmm0, %xmm4 ; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: por %xmm4, %xmm0 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 @@ -497,16 +497,16 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 ; X32-SSE-NEXT: pandn %xmm0, %xmm4 ; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: por %xmm4, %xmm0 ; X32-SSE-NEXT: 
paddb %xmm1, %xmm1 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 ; X32-SSE-NEXT: movdqa %xmm2, %xmm1 ; X32-SSE-NEXT: pandn %xmm0, %xmm1 ; X32-SSE-NEXT: psrlw $1, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: pand %xmm2, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: por %xmm1, %xmm0 ; X32-SSE-NEXT: retl %shift = lshr <16 x i8> %a, %b diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index 67963b1f992..a26fccd44c8 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -295,8 +295,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -304,8 +304,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: pandn %xmm0, %xmm4 ; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm4, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 @@ -405,8 +405,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 ; X32-SSE-NEXT: pandn %xmm0, %xmm4 ; X32-SSE-NEXT: psllw $4, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: por %xmm4, %xmm0 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 @@ -414,8 +414,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X32-SSE-NEXT: movdqa %xmm3, %xmm4 ; X32-SSE-NEXT: pandn %xmm0, %xmm4 ; X32-SSE-NEXT: psllw $2, %xmm0 -; 
X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: pand %xmm3, %xmm0 +; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-SSE-NEXT: por %xmm4, %xmm0 ; X32-SSE-NEXT: paddb %xmm1, %xmm1 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 2f0be026fd9..2ade0c5c646 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -846,7 +846,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -863,7 +863,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3 define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index d9f186e64f1..e552f5f4036 100644 --- 
a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -233,7 +233,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -347,7 +348,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -680,22 +681,23 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, 
%xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -781,13 +783,13 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 @@ -1106,7 +1108,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] ; 
AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 ; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 @@ -1220,7 +1223,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -1575,7 +1578,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm7 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpsubq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 ; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 @@ -1687,7 +1691,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 @@ -2275,7 +2279,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 ; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 @@ -2451,7 +2456,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 @@ -2909,7 +2914,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-NEXT: vpmuludq %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -3049,7 +3055,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 @@ -3351,27 +3357,28 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) 
nounwin ; ; AVX1-LABEL: trunc_and_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vandpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vandpd %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vandpd %ymm7, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -3468,7 +3475,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = 
[255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -3751,22 +3758,23 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_and_const_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -3852,13 +3860,13 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_and_const_v16i32_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 @@ -4153,27 +4161,28 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX1-LABEL: trunc_xor_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vxorpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vxorpd %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vxorpd %ymm7, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, 
%xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -4270,7 +4279,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -4553,22 +4562,23 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand 
%xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -4654,13 +4664,13 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 @@ -4955,27 +4965,28 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; ; AVX1-LABEL: trunc_or_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vorpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vorpd %ymm5, %ymm1, %ymm1 +; AVX1-NEXT: vorpd %ymm6, %ymm2, %ymm2 +; AVX1-NEXT: vorpd %ymm7, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vmovaps 
{{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -5072,7 +5083,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 @@ -5355,22 +5366,23 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; AVX1-LABEL: trunc_or_const_v16i64_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vandpd %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandpd %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 @@ -5456,13 +5468,13 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { ; AVX1-LABEL: trunc_or_const_v16i32_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: 
vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll index 91ede6cb062..61935dce8f8 100644 --- a/test/CodeGen/X86/vector-trunc-packus.ll +++ b/test/CodeGen/X86/vector-trunc-packus.ll @@ -2070,24 +2070,26 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpand %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpand 
%xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll index 3e5dcc5c3c2..500d8ba1511 100644 --- a/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/test/CodeGen/X86/vector-trunc-ssat.ll @@ -2001,7 +2001,8 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/vector-trunc-usat.ll b/test/CodeGen/X86/vector-trunc-usat.ll index 1bde6c3a141..5b00ab58495 100644 --- a/test/CodeGen/X86/vector-trunc-usat.ll +++ b/test/CodeGen/X86/vector-trunc-usat.ll @@ -1417,7 +1417,8 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index db3692f318f..79cbb8cc924 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ 
b/test/CodeGen/X86/vector-trunc.ll @@ -286,13 +286,14 @@ define void @trunc8i64_8i8(<8 x i64> %a) { ; AVX1-LABEL: trunc8i64_8i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandpd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 @@ -907,13 +908,13 @@ define void @trunc16i32_16i8(<16 x i32> %a) { ; AVX1-LABEL: trunc16i32_16i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll index 5cfa38ab833..36e29abf8d7 100644 --- a/test/CodeGen/X86/vshift-6.ll +++ b/test/CodeGen/X86/vshift-6.ll 
@@ -50,8 +50,8 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) { ; X32-NEXT: movdqa %xmm2, %xmm4 ; X32-NEXT: pandn %xmm0, %xmm4 ; X32-NEXT: psllw $2, %xmm0 -; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-NEXT: pand %xmm2, %xmm0 +; X32-NEXT: pand {{\.LCPI.*}}, %xmm0 ; X32-NEXT: por %xmm4, %xmm0 ; X32-NEXT: paddb %xmm1, %xmm1 ; X32-NEXT: pcmpgtb %xmm1, %xmm3 @@ -85,8 +85,8 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) { ; X64-NEXT: movdqa %xmm2, %xmm4 ; X64-NEXT: pandn %xmm0, %xmm4 ; X64-NEXT: psllw $2, %xmm0 -; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pand {{.*}}(%rip), %xmm0 ; X64-NEXT: por %xmm4, %xmm0 ; X64-NEXT: paddb %xmm1, %xmm1 ; X64-NEXT: pcmpgtb %xmm1, %xmm3 diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll index e4624eaf363..41d69e544aa 100644 --- a/test/CodeGen/X86/x86-interleaved-access.ll +++ b/test/CodeGen/X86/x86-interleaved-access.ll @@ -1029,7 +1029,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){ ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2 ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: # ymm5 = mem[0,1,0,1] ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 @@ -1585,7 +1586,8 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){ ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = 
xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12 -; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; AVX1-NEXT: # ymm13 = mem[0,1,0,1] ; AVX1-NEXT: vandnps %ymm12, %ymm13, %ymm12 ; AVX1-NEXT: vandps %ymm13, %ymm14, %ymm14 ; AVX1-NEXT: vorps %ymm12, %ymm14, %ymm12 -- GitLab From aa8c49dafa13b3565af79710ef7a4933180dd84b Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 05:07:54 +0000 Subject: [PATCH 0177/1116] [ORC] Simplify naming for JITDylib definition generators. Renames: JITDylib's setFallbackDefinitionGenerator method to setGenerator. DynamicLibraryFallbackGenerator class to DynamicLibrarySearchGenerator. ReexportsFallbackDefinitionGenerator to ReexportsGenerator. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344489 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/Core.h | 31 +++++++----- .../llvm/ExecutionEngine/Orc/ExecutionUtils.h | 32 ++++++------ lib/ExecutionEngine/Orc/Core.cpp | 49 +++++++++---------- lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 20 +++++--- tools/lli/lli.cpp | 4 +- .../ExecutionEngine/Orc/CoreAPIsTest.cpp | 19 +++---- 6 files changed, 80 insertions(+), 75 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h index 24cdeeae42e..67b16894f6c 100644 --- a/include/llvm/ExecutionEngine/Orc/Core.h +++ b/include/llvm/ExecutionEngine/Orc/Core.h @@ -395,15 +395,22 @@ reexports(JITDylib &SourceJD, SymbolAliasMap Aliases) { Expected buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols); -class ReexportsFallbackDefinitionGenerator { +/// ReexportsGenerator can be used with JITDylib::setGenerator to automatically +/// re-export a subset of the source JITDylib's symbols in the target. +class ReexportsGenerator { public: using SymbolPredicate = std::function; - ReexportsFallbackDefinitionGenerator(JITDylib &BackingJD, - SymbolPredicate Allow); + + /// Create a reexports generator. If an Allow predicate is passed, only + /// symbols for which the predicate returns true will be reexported. If no + /// Allow predicate is passed, all symbols will be exported. + ReexportsGenerator(JITDylib &SourceJD, + SymbolPredicate Allow = SymbolPredicate()); + SymbolNameSet operator()(JITDylib &JD, const SymbolNameSet &Names); private: - JITDylib &BackingJD; + JITDylib &SourceJD; SymbolPredicate Allow; }; @@ -478,7 +485,7 @@ class JITDylib { friend class ExecutionSession; friend class MaterializationResponsibility; public: - using FallbackDefinitionGeneratorFunction = std::function; using AsynchronousSymbolQuerySet = @@ -495,12 +502,12 @@ public: /// Get a reference to the ExecutionSession for this JITDylib. 
ExecutionSession &getExecutionSession() const { return ES; } - /// Set a fallback defenition generator. If set, lookup and lookupFlags will - /// pass the unresolved symbols set to the fallback definition generator, - /// allowing it to add a new definition to the JITDylib. - void setFallbackDefinitionGenerator( - FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator) { - this->FallbackDefinitionGenerator = std::move(FallbackDefinitionGenerator); + /// Set a definition generator. If set, whenever a symbol fails to resolve + /// within this JITDylib, lookup and lookupFlags will pass the unresolved + /// symbols set to the definition generator. The generator can optionally + /// add a definition for the unresolved symbols to the dylib. + void setGenerator(GeneratorFunction DefGenerator) { + this->DefGenerator = std::move(DefGenerator); } /// Set the search order to be used when fixing up definitions in JITDylib. @@ -667,7 +674,7 @@ private: SymbolMap Symbols; UnmaterializedInfosMap UnmaterializedInfos; MaterializingInfosMap MaterializingInfos; - FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator; + GeneratorFunction DefGenerator; JITDylibList SearchOrder; }; diff --git a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index 52250662a95..662ed7b78e4 100644 --- a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -212,32 +212,30 @@ public: /// If an instance of this class is attached to a JITDylib as a fallback /// definition generator, then any symbol found in the given DynamicLibrary that /// passes the 'Allow' predicate will be added to the JITDylib. 
-class DynamicLibraryFallbackGenerator { +class DynamicLibrarySearchGenerator { public: using SymbolPredicate = std::function; - static bool AllowAll(SymbolStringPtr Name) { return true; } - - /// Create a DynamicLibraryFallbackGenerator that searches for symbols in the + /// Create a DynamicLibrarySearchGenerator that searches for symbols in the /// given sys::DynamicLibrary. - /// Only symbols that match the 'Allow' predicate will be searched for. - DynamicLibraryFallbackGenerator(sys::DynamicLibrary Dylib, - const DataLayout &DL, - SymbolPredicate Allow = AllowAll); + /// If the Allow predicate is given then only symbols matching the predicate + /// will be searched for in the DynamicLibrary. If the predicate is not given + /// then all symbols will be searched for. + DynamicLibrarySearchGenerator(sys::DynamicLibrary Dylib, const DataLayout &DL, + SymbolPredicate Allow = SymbolPredicate()); /// Permanently loads the library at the given path and, on success, returns - /// a DynamicLibraryFallbackGenerator that will search it for symbol - /// definitions matching the Allow predicate. - /// On failure returns the reason the library failed to load. - static Expected + /// a DynamicLibrarySearchGenerator that will search it for symbol definitions + /// in the library. On failure returns the reason the library failed to load. + static Expected Load(const char *FileName, const DataLayout &DL, - SymbolPredicate Allow = AllowAll); + SymbolPredicate Allow = SymbolPredicate()); - /// Creates a DynamicLibraryFallbackGenerator that searches for symbols in + /// Creates a DynamicLibrarySearchGenerator that searches for symbols in /// the current process. 
- static Expected - CreateForCurrentProcess(const DataLayout &DL, - SymbolPredicate Allow = AllowAll) { + static Expected + GetForCurrentProcess(const DataLayout &DL, + SymbolPredicate Allow = SymbolPredicate()) { return Load(nullptr, DL, std::move(Allow)); } diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp index c9cfacef61b..3fa28a5af6f 100644 --- a/lib/ExecutionEngine/Orc/Core.cpp +++ b/lib/ExecutionEngine/Orc/Core.cpp @@ -686,26 +686,26 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) { return Result; } -ReexportsFallbackDefinitionGenerator::ReexportsFallbackDefinitionGenerator( - JITDylib &BackingJD, SymbolPredicate Allow) - : BackingJD(BackingJD), Allow(std::move(Allow)) {} +ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD, + SymbolPredicate Allow) + : SourceJD(SourceJD), Allow(std::move(Allow)) {} -SymbolNameSet ReexportsFallbackDefinitionGenerator:: -operator()(JITDylib &JD, const SymbolNameSet &Names) { +SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD, + const SymbolNameSet &Names) { orc::SymbolNameSet Added; orc::SymbolAliasMap AliasMap; - auto Flags = BackingJD.lookupFlags(Names); + auto Flags = SourceJD.lookupFlags(Names); for (auto &KV : Flags) { - if (!Allow(KV.first)) + if (Allow && !Allow(KV.first)) continue; AliasMap[KV.first] = SymbolAliasMapEntry(KV.first, KV.second); Added.insert(KV.first); } if (!Added.empty()) - cantFail(JD.define(reexports(BackingJD, AliasMap))); + cantFail(JD.define(reexports(SourceJD, AliasMap))); return Added; } @@ -1117,10 +1117,10 @@ SymbolFlagsMap JITDylib::lookupFlags(const SymbolNameSet &Names) { return ES.runSessionLocked([&, this]() { SymbolFlagsMap Result; auto Unresolved = lookupFlagsImpl(Result, Names); - if (FallbackDefinitionGenerator && !Unresolved.empty()) { - auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved); - if (!FallbackDefs.empty()) { - auto Unresolved2 = lookupFlagsImpl(Result, FallbackDefs); + if 
(DefGenerator && !Unresolved.empty()) { + auto NewDefs = DefGenerator(*this, Unresolved); + if (!NewDefs.empty()) { + auto Unresolved2 = lookupFlagsImpl(Result, NewDefs); (void)Unresolved2; assert(Unresolved2.empty() && "All fallback defs should have been found by lookupFlagsImpl"); @@ -1156,14 +1156,13 @@ void JITDylib::lodgeQuery(std::shared_ptr &Q, assert(Q && "Query can not be null"); lodgeQueryImpl(Q, Unresolved, MatchNonExportedInJD, MatchNonExported, MUs); - if (FallbackDefinitionGenerator && !Unresolved.empty()) { - auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved); - if (!FallbackDefs.empty()) { - for (auto &D : FallbackDefs) + if (DefGenerator && !Unresolved.empty()) { + auto NewDefs = DefGenerator(*this, Unresolved); + if (!NewDefs.empty()) { + for (auto &D : NewDefs) Unresolved.erase(D); - lodgeQueryImpl(Q, FallbackDefs, MatchNonExportedInJD, MatchNonExported, - MUs); - assert(FallbackDefs.empty() && + lodgeQueryImpl(Q, NewDefs, MatchNonExportedInJD, MatchNonExported, MUs); + assert(NewDefs.empty() && "All fallback defs should have been found by lookupImpl"); } } @@ -1250,15 +1249,15 @@ SymbolNameSet JITDylib::legacyLookup(std::shared_ptr Q, SymbolNameSet Unresolved = std::move(Names); ES.runSessionLocked([&, this]() { ActionFlags = lookupImpl(Q, MUs, Unresolved); - if (FallbackDefinitionGenerator && !Unresolved.empty()) { + if (DefGenerator && !Unresolved.empty()) { assert(ActionFlags == None && "ActionFlags set but unresolved symbols remain?"); - auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved); - if (!FallbackDefs.empty()) { - for (auto &D : FallbackDefs) + auto NewDefs = DefGenerator(*this, Unresolved); + if (!NewDefs.empty()) { + for (auto &D : NewDefs) Unresolved.erase(D); - ActionFlags = lookupImpl(Q, MUs, FallbackDefs); - assert(FallbackDefs.empty() && + ActionFlags = lookupImpl(Q, MUs, NewDefs); + assert(NewDefs.empty() && "All fallback defs should have been found by lookupImpl"); } } diff --git 
a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 6a180106240..667237373ca 100644 --- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -178,21 +178,22 @@ Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD, return JD.define(absoluteSymbols(std::move(RuntimeInterposes))); } -DynamicLibraryFallbackGenerator::DynamicLibraryFallbackGenerator( +DynamicLibrarySearchGenerator::DynamicLibrarySearchGenerator( sys::DynamicLibrary Dylib, const DataLayout &DL, SymbolPredicate Allow) : Dylib(std::move(Dylib)), Allow(std::move(Allow)), GlobalPrefix(DL.getGlobalPrefix()) {} -Expected DynamicLibraryFallbackGenerator::Load( - const char *FileName, const DataLayout &DL, SymbolPredicate Allow) { +Expected +DynamicLibrarySearchGenerator::Load(const char *FileName, const DataLayout &DL, + SymbolPredicate Allow) { std::string ErrMsg; auto Lib = sys::DynamicLibrary::getPermanentLibrary(FileName, &ErrMsg); if (!Lib.isValid()) return make_error(std::move(ErrMsg), inconvertibleErrorCode()); - return DynamicLibraryFallbackGenerator(std::move(Lib), DL, std::move(Allow)); + return DynamicLibrarySearchGenerator(std::move(Lib), DL, std::move(Allow)); } -SymbolNameSet DynamicLibraryFallbackGenerator:: +SymbolNameSet DynamicLibrarySearchGenerator:: operator()(JITDylib &JD, const SymbolNameSet &Names) { orc::SymbolNameSet Added; orc::SymbolMap NewSymbols; @@ -200,7 +201,10 @@ operator()(JITDylib &JD, const SymbolNameSet &Names) { bool HasGlobalPrefix = (GlobalPrefix != '\0'); for (auto &Name : Names) { - if (!Allow(Name) || (*Name).empty()) + if ((*Name).empty()) + continue; + + if (Allow && !Allow(Name)) continue; if (HasGlobalPrefix && (*Name).front() != GlobalPrefix) @@ -215,8 +219,8 @@ operator()(JITDylib &JD, const SymbolNameSet &Names) { } } - // Add any new symbols to JD. 
Since the fallback generator is only called for - // symbols that are not already defined, this will never trigger a duplicate + // Add any new symbols to JD. Since the generator is only called for symbols + // that are not already defined, this will never trigger a duplicate // definition error, so we can wrap this call in a 'cantFail'. if (!NewSymbols.empty()) cantFail(JD.define(absoluteSymbols(std::move(NewSymbols)))); diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp index 4794fe532a5..d633fe6f800 100644 --- a/tools/lli/lli.cpp +++ b/tools/lli/lli.cpp @@ -793,8 +793,8 @@ int runOrcLazyJIT(const char *ProgName) { } return Dump(std::move(TSM), R); }); - J->getMainJITDylib().setFallbackDefinitionGenerator(ExitOnErr( - orc::DynamicLibraryFallbackGenerator::CreateForCurrentProcess(DL))); + J->getMainJITDylib().setGenerator( + ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(DL))); orc::MangleAndInterner Mangle(J->getExecutionSession(), DL); orc::LocalCXXRuntimeOverrides2 CXXRuntimeOverrides; diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp index c8fa6ef5297..1ccc4755957 100644 --- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp +++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp @@ -342,17 +342,15 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) { EXPECT_FALSE(BarMaterialized) << "Bar should not have been materialized"; } -TEST_F(CoreAPIsStandardTest, TestReexportsFallbackGenerator) { - // Test that a re-exports fallback generator can dynamically generate - // reexports. +TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) { + // Test that a re-exports generator can dynamically generate reexports. 
auto &JD2 = ES.createJITDylib("JD2"); cantFail(JD2.define(absoluteSymbols({{Foo, FooSym}, {Bar, BarSym}}))); auto Filter = [this](SymbolStringPtr Name) { return Name != Bar; }; - JD.setFallbackDefinitionGenerator( - ReexportsFallbackDefinitionGenerator(JD2, Filter)); + JD.setGenerator(ReexportsGenerator(JD2, Filter)); auto Flags = JD.lookupFlags({Foo, Bar, Baz}); EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results"; @@ -679,14 +677,13 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) { << "Expected Bar == BarSym"; } -TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) { +TEST_F(CoreAPIsStandardTest, GeneratorTest) { cantFail(JD.define(absoluteSymbols({{Foo, FooSym}}))); - JD.setFallbackDefinitionGenerator( - [&](JITDylib &JD2, const SymbolNameSet &Names) { - cantFail(JD2.define(absoluteSymbols({{Bar, BarSym}}))); - return SymbolNameSet({Bar}); - }); + JD.setGenerator([&](JITDylib &JD2, const SymbolNameSet &Names) { + cantFail(JD2.define(absoluteSymbols({{Bar, BarSym}}))); + return SymbolNameSet({Bar}); + }); auto Result = cantFail(ES.lookup({&JD}, {Foo, Bar})); -- GitLab From 15ca92098aa137c257b7dfea86c7b4daa8eaf1af Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 05:31:24 +0000 Subject: [PATCH 0178/1116] [X86] Autogenerate checks. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344490 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fold-vex.ll | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll index 006db6effdf..c7b376a053d 100644 --- a/test/CodeGen/X86/fold-vex.ll +++ b/test/CodeGen/X86/fold-vex.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Use CPU parameters to ensure that a CPU-specific attribute is not overriding the AVX definition. 
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s @@ -14,18 +15,20 @@ ; unless specially configured on some CPUs such as AMD Family 10H. define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq +; +; SSE-LABEL: test1: +; SSE: # %bb.0: +; SSE-NEXT: movups (%rdi), %xmm1 +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: retq %in0 = load <4 x i32>, <4 x i32>* %p0, align 2 %a = and <4 x i32> %in0, %in1 ret <4 x i32> %a -; CHECK-LABEL: @test1 -; CHECK-NOT: vmovups -; CHECK: vandps (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: ret -; SSE-LABEL: @test1 -; SSE: movups (%rdi), %xmm1 -; SSE-NEXT: andps %xmm1, %xmm0 -; SSE-NEXT: ret } -- GitLab From ffc5ec8c8122b984871c59516426cdd8ad18e7ea Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Mon, 15 Oct 2018 08:36:03 +0000 Subject: [PATCH 0179/1116] [TwoAddressInstructionPass] Replace subregister uses when processing tied operands Summary: TwoAddressInstruction pass typically rewrites %1:short = foo %0.sub_lo:long as %1:short = COPY %0.sub_lo:long %1:short = foo %1:short when having tied operands. If there are extra un-tied operands that uses the same reg and subreg, such as the second and third inputs to fie here: %1:short = fie %0.sub_lo:long, %0.sub_hi:long, %0.sub_lo:long then there was a bug which replaced the register %0 also for the un-tied operand, but without changing the subregister indices. 
So we used to get: %1:short = COPY %0.sub_lo:long %1:short = fie %1, %1.sub_hi:short, %1.sub_lo:short With this fix we instead get: %1:short = COPY %0.sub_lo:long %1:short = fie %1, %0.sub_hi:long, %1 Reviewers: arsenm, JesperAntonsson, kparzysz, MatzeB Reviewed By: MatzeB Subscribers: bjope, kparzysz, wdng, llvm-commits Differential Revision: https://reviews.llvm.org/D36224 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344492 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/TwoAddressInstructionPass.cpp | 21 ++++--- .../CodeGen/Hexagon/two-addr-tied-subregs.mir | 56 +++++++++++++++++++ 2 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 test/CodeGen/Hexagon/two-addr-tied-subregs.mir diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 99ccb0f9c9f..2e2fe72e539 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1608,23 +1608,28 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, } if (AllUsesCopied) { + bool ReplacedAllUntiedUses = true; if (!IsEarlyClobber) { // Replace other (un-tied) uses of regB with LastCopiedReg. for (MachineOperand &MO : MI->operands()) { - if (MO.isReg() && MO.getReg() == RegB && - MO.isUse()) { - if (MO.isKill()) { - MO.setIsKill(false); - RemovedKillFlag = true; + if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { + if (MO.getSubReg() == SubRegB) { + if (MO.isKill()) { + MO.setIsKill(false); + RemovedKillFlag = true; + } + MO.setReg(LastCopiedReg); + MO.setSubReg(0); + } else { + ReplacedAllUntiedUses = false; } - MO.setReg(LastCopiedReg); - MO.setSubReg(MO.getSubReg()); } } } // Update live variables for regB. 
- if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(*MI)) { + if (RemovedKillFlag && ReplacedAllUntiedUses && + LV && LV->getVarInfo(RegB).removeKill(*MI)) { MachineBasicBlock::iterator PrevMI = MI; --PrevMI; LV->addVirtualRegisterKilled(RegB, *PrevMI); diff --git a/test/CodeGen/Hexagon/two-addr-tied-subregs.mir b/test/CodeGen/Hexagon/two-addr-tied-subregs.mir new file mode 100644 index 00000000000..87e117c461b --- /dev/null +++ b/test/CodeGen/Hexagon/two-addr-tied-subregs.mir @@ -0,0 +1,56 @@ +# RUN: llc -march hexagon -run-pass livevars -run-pass twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck %s + + +############################################################################### + +--- +name: test1 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $d0 + + %0:doubleregs = COPY killed $d0 + %1:intregs = S2_lsr_i_r_acc %0.isub_lo, %0.isub_lo, 16 + +... + +# Verify that both uses if %0.isub_lo are replaced here. +# (we used to get %1:intregs = S2_lsr_i_r_acc %1, %1.isub_lo, 16) +# +# CHECK-LABEL: name: test1 +# CHECK: bb.0.entry: +# CHECK: %0:doubleregs = COPY killed $d0 +# CHECK-NEXT: %1:intregs = COPY killed %0.isub_lo +# CHECK-NEXT: %1:intregs = S2_lsr_i_r_acc %1, %1, 16 + + +############################################################################### + +--- +name: test2 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $d0 + + %0:doubleregs = COPY killed $d0 + %1:intregs = S2_lsr_i_r_acc %0.isub_lo, %0.isub_hi, 16 + +... + +# Verify that the use of %0.isub_hi isn't replaced here. +# (we used to get %1:intregs = S2_lsr_i_r_acc %1, %1.isub_hi, 16) +# +# We also used to get an incorrect "killed" for %0 in the second COPY. +# So we verify that we do not get machine verifier complaints here. +# An improvement could be to get a "killed" attribute on the last +# use of %0.isub_hi, but we do not need it for the IR to be valid. 
+# +# CHECK-LABEL: name: test2 +# CHECK: bb.0.entry: +# CHECK: %0:doubleregs = COPY killed $d0 +# CHECK-NEXT: %1:intregs = COPY %0.isub_lo +# CHECK-NEXT: %1:intregs = S2_lsr_i_r_acc %1, %0.isub_hi, 16 + +############################################################################### -- GitLab From 919972ec1f052ca0b29942c71745336298d694ef Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 15 Oct 2018 09:09:19 +0000 Subject: [PATCH 0180/1116] [llvm-exegesis][NFC] Return many CodeTemplates instead of one. Summary: This is part one of the change where I simply changed the signature of the functions. More work need to be done to actually produce more than one CodeTemplate per instruction. Reviewers: courbet Subscribers: tschuett, llvm-commits Differential Revision: https://reviews.llvm.org/D53209 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344493 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/Latency.cpp | 24 +++---- tools/llvm-exegesis/lib/Latency.h | 8 +-- tools/llvm-exegesis/lib/SnippetGenerator.cpp | 67 +++++++++++-------- tools/llvm-exegesis/lib/SnippetGenerator.h | 23 ++++--- tools/llvm-exegesis/lib/Uops.cpp | 12 ++-- tools/llvm-exegesis/lib/Uops.h | 4 +- tools/llvm-exegesis/lib/X86/Target.cpp | 18 ++--- .../X86/SnippetGeneratorTest.cpp | 12 ++-- 8 files changed, 90 insertions(+), 78 deletions(-) diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp index f6786b123ad..040b42b53e2 100644 --- a/tools/llvm-exegesis/lib/Latency.cpp +++ b/tools/llvm-exegesis/lib/Latency.cpp @@ -22,9 +22,9 @@ namespace exegesis { LatencySnippetGenerator::~LatencySnippetGenerator() = default; -llvm::Expected -LatencySnippetGenerator::generateTwoInstructionPrototype( - const Instruction &Instr) const { +llvm::Expected> +generateTwoInstructionPrototypes(const LLVMState &State, + const Instruction &Instr) { std::vector Opcodes; Opcodes.resize(State.getInstrInfo().getNumOpcodes()); std::iota(Opcodes.begin(), 
Opcodes.end(), 0U); @@ -50,23 +50,23 @@ LatencySnippetGenerator::generateTwoInstructionPrototype( State.getInstrInfo().getName(OtherOpcode)); CT.Instructions.push_back(std::move(ThisIT)); CT.Instructions.push_back(std::move(OtherIT)); - return std::move(CT); + return getSingleton(CT); } return llvm::make_error( "Infeasible : Didn't find any scheme to make the instruction serial"); } -llvm::Expected -LatencySnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { +llvm::Expected> +LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const { if (Instr.hasMemoryOperands()) return llvm::make_error( "Infeasible : has memory operands"); - if (auto CT = generateSelfAliasingCodeTemplate(Instr)) - return CT; - else - llvm::consumeError(CT.takeError()); - // No self aliasing, trying to create a dependency through another opcode. - return generateTwoInstructionPrototype(Instr); + return llvm::handleExpected( // + generateSelfAliasingCodeTemplates(Instr), + [this, &Instr]() { + return generateTwoInstructionPrototypes(State, Instr); + }, + [](const BenchmarkFailure &) { /*Consume Error*/ }); } const char *LatencyBenchmarkRunner::getCounterName() const { diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h index 83c798f60f3..f78f12615c7 100644 --- a/tools/llvm-exegesis/lib/Latency.h +++ b/tools/llvm-exegesis/lib/Latency.h @@ -26,12 +26,8 @@ public: LatencySnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {} ~LatencySnippetGenerator() override; - llvm::Expected - generateCodeTemplate(const Instruction &Instr) const override; - -private: - llvm::Expected - generateTwoInstructionPrototype(const Instruction &Instr) const; + llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const override; }; class LatencyBenchmarkRunner : public BenchmarkRunner { diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index f7a76d88ccf..9b577fd65a9 
100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -22,6 +22,12 @@ namespace exegesis { +std::vector getSingleton(CodeTemplate &CT) { + std::vector Result; + Result.push_back(std::move(CT)); + return Result; +} + SnippetGeneratorFailure::SnippetGeneratorFailure(const llvm::Twine &S) : llvm::StringError(S, llvm::inconvertibleErrorCode()) {} @@ -31,26 +37,28 @@ SnippetGenerator::~SnippetGenerator() = default; llvm::Expected> SnippetGenerator::generateConfigurations(const Instruction &Instr) const { - if (auto E = generateCodeTemplate(Instr)) { - CodeTemplate &CT = E.get(); + if (auto E = generateCodeTemplates(Instr)) { const auto &RATC = State.getRATC(); - const llvm::BitVector &ForbiddenRegs = - CT.ScratchSpacePointerInReg - ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits() - : RATC.emptyRegisters(); std::vector Output; - // TODO: Generate as many BenchmarkCode as needed. - { - BenchmarkCode BC; - BC.Info = CT.Info; - for (InstructionTemplate &IT : CT.Instructions) { - randomizeUnsetVariables(ForbiddenRegs, IT); - BC.Instructions.push_back(IT.build()); + for (CodeTemplate &CT : E.get()) { + const llvm::BitVector &ForbiddenRegs = + CT.ScratchSpacePointerInReg + ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits() + : RATC.emptyRegisters(); + // TODO: Generate as many BenchmarkCode as needed. 
+ { + BenchmarkCode BC; + BC.Info = CT.Info; + for (InstructionTemplate &IT : CT.Instructions) { + randomizeUnsetVariables(ForbiddenRegs, IT); + BC.Instructions.push_back(IT.build()); + } + if (CT.ScratchSpacePointerInReg) + BC.LiveIns.push_back(CT.ScratchSpacePointerInReg); + BC.RegisterInitialValues = + computeRegisterInitialValues(CT.Instructions); + Output.push_back(std::move(BC)); } - if (CT.ScratchSpacePointerInReg) - BC.LiveIns.push_back(CT.ScratchSpacePointerInReg); - BC.RegisterInitialValues = computeRegisterInitialValues(CT.Instructions); - Output.push_back(std::move(BC)); } return Output; } else @@ -99,13 +107,14 @@ std::vector SnippetGenerator::computeRegisterInitialValues( return RIV; } -llvm::Expected SnippetGenerator::generateSelfAliasingCodeTemplate( - const Instruction &Instr) const { +llvm::Expected> +generateSelfAliasingCodeTemplates(const Instruction &Instr) { const AliasingConfigurations SelfAliasing(Instr, Instr); - if (SelfAliasing.empty()) { + if (SelfAliasing.empty()) return llvm::make_error("empty self aliasing"); - } - CodeTemplate CT; + std::vector Result; + Result.emplace_back(); + CodeTemplate &CT = Result.back(); InstructionTemplate IT(Instr); if (SelfAliasing.hasImplicitAliasing()) { CT.Info = "implicit Self cycles, picking random values."; @@ -116,16 +125,18 @@ llvm::Expected SnippetGenerator::generateSelfAliasingCodeTemplate( setRandomAliasing(SelfAliasing, IT, IT); } CT.Instructions.push_back(std::move(IT)); - return std::move(CT); + return Result; } -llvm::Expected -SnippetGenerator::generateUnconstrainedCodeTemplate(const Instruction &Instr, - llvm::StringRef Msg) const { - CodeTemplate CT; +llvm::Expected> +generateUnconstrainedCodeTemplates(const Instruction &Instr, + llvm::StringRef Msg) { + std::vector Result; + Result.emplace_back(); + CodeTemplate &CT = Result.back(); CT.Info = llvm::formatv("{0}, repeating an unconstrained assignment", Msg); CT.Instructions.emplace_back(Instr); - return std::move(CT); + return Result; } 
std::mt19937 &randomGenerator() { diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h index c9a19cd0eeb..e48cf0cfeb0 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.h +++ b/tools/llvm-exegesis/lib/SnippetGenerator.h @@ -30,6 +30,17 @@ namespace exegesis { +std::vector getSingleton(CodeTemplate &CT); + +// Generates code templates that has a self-dependency. +llvm::Expected> +generateSelfAliasingCodeTemplates(const Instruction &Instr); + +// Generates code templates without assignment constraints. +llvm::Expected> +generateUnconstrainedCodeTemplates(const Instruction &Instr, + llvm::StringRef Msg); + // A class representing failures that happened during Benchmark, they are used // to report informations to the user. class SnippetGeneratorFailure : public llvm::StringError { @@ -55,18 +66,10 @@ public: protected: const LLVMState &State; - // Generates a single code template that has a self-dependency. - llvm::Expected - generateSelfAliasingCodeTemplate(const Instruction &Instr) const; - // Generates a single code template without assignment constraints. - llvm::Expected - generateUnconstrainedCodeTemplate(const Instruction &Instr, - llvm::StringRef Msg) const; - private: // API to be implemented by subclasses. - virtual llvm::Expected - generateCodeTemplate(const Instruction &Instr) const = 0; + virtual llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const = 0; }; // A global Random Number Generator to randomize configurations. 
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp index 1a701d169eb..a3ada77ef8c 100644 --- a/tools/llvm-exegesis/lib/Uops.cpp +++ b/tools/llvm-exegesis/lib/Uops.cpp @@ -124,8 +124,8 @@ void UopsSnippetGenerator::instantiateMemoryOperands( "not enough scratch space"); } -llvm::Expected -UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { +llvm::Expected> +UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const { CodeTemplate CT; const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr; if (Instr.hasMemoryOperands()) { @@ -153,13 +153,13 @@ UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { CT.Info = "instruction is parallel, repeating a random one."; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); - return std::move(CT); + return getSingleton(CT); } if (SelfAliasing.hasImplicitAliasing()) { CT.Info = "instruction is serial, repeating a random one."; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); - return std::move(CT); + return getSingleton(CT); } const auto TiedVariables = getVariablesWithTiedOperands(Instr); if (!TiedVariables.empty()) { @@ -181,7 +181,7 @@ UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { CT.Instructions.push_back(std::move(TmpIT)); } instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); - return std::move(CT); + return getSingleton(CT); } const auto &ReservedRegisters = State.getRATC().reservedRegisters(); // No tied variables, we pick random values for defs. 
@@ -218,7 +218,7 @@ UopsSnippetGenerator::generateCodeTemplate(const Instruction &Instr) const { "instruction has no tied variables picking Uses different from defs"; CT.Instructions.push_back(std::move(IT)); instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions); - return std::move(CT); + return getSingleton(CT); } std::vector diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h index 1cfa8242078..e6f6d4a09cb 100644 --- a/tools/llvm-exegesis/lib/Uops.h +++ b/tools/llvm-exegesis/lib/Uops.h @@ -25,8 +25,8 @@ public: UopsSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {} ~UopsSnippetGenerator() override; - llvm::Expected - generateCodeTemplate(const Instruction &Instr) const override; + llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const override; static constexpr const size_t kMinNumDifferentAddresses = 6; diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index 0e9a6de95ce..20bb65ebde5 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -38,14 +38,14 @@ class X86LatencySnippetGenerator : public LatencySnippetGenerator { public: using LatencySnippetGenerator::LatencySnippetGenerator; - llvm::Expected - generateCodeTemplate(const Instruction &Instr) const override { + llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const override { if (auto E = IsInvalidOpcode(Instr)) return std::move(E); switch (GetX86FPFlags(Instr)) { case llvm::X86II::NotFP: - return LatencySnippetGenerator::generateCodeTemplate(Instr); + return LatencySnippetGenerator::generateCodeTemplates(Instr); case llvm::X86II::ZeroArgFP: case llvm::X86II::OneArgFP: case llvm::X86II::SpecialFP: @@ -58,7 +58,7 @@ public: // - `ST(0) = fsqrt(ST(0))` (OneArgFPRW) // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) // They are intrinsically serial and do not modify the state of the stack. 
- return generateSelfAliasingCodeTemplate(Instr); + return generateSelfAliasingCodeTemplates(Instr); default: llvm_unreachable("Unknown FP Type!"); } @@ -69,14 +69,14 @@ class X86UopsSnippetGenerator : public UopsSnippetGenerator { public: using UopsSnippetGenerator::UopsSnippetGenerator; - llvm::Expected - generateCodeTemplate(const Instruction &Instr) const override { + llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const override { if (auto E = IsInvalidOpcode(Instr)) return std::move(E); switch (GetX86FPFlags(Instr)) { case llvm::X86II::NotFP: - return UopsSnippetGenerator::generateCodeTemplate(Instr); + return UopsSnippetGenerator::generateCodeTemplates(Instr); case llvm::X86II::ZeroArgFP: case llvm::X86II::OneArgFP: case llvm::X86II::SpecialFP: @@ -88,12 +88,12 @@ public: // - `ST(0) = ST(0) + ST(i)` (TwoArgFP) // They are intrinsically serial and do not modify the state of the stack. // We generate the same code for latency and uops. - return generateSelfAliasingCodeTemplate(Instr); + return generateSelfAliasingCodeTemplates(Instr); case llvm::X86II::CompareFP: case llvm::X86II::CondMovFP: // We can compute uops for any FP instruction that does not grow or shrink // the stack (either do not touch the stack or push as much as they pop). - return generateUnconstrainedCodeTemplate( + return generateUnconstrainedCodeTemplates( Instr, "instruction does not grow/shrink the FP stack"); default: llvm_unreachable("Unknown FP Type!"); diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp index 4df489df06f..6cc24a02cfc 100644 --- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp +++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp @@ -60,9 +60,11 @@ protected: CodeTemplate checkAndGetCodeTemplate(unsigned Opcode) { randomGenerator().seed(0); // Initialize seed. 
const Instruction Instr(State, Opcode); - auto CodeTemplateOrError = Generator.generateCodeTemplate(Instr); + auto CodeTemplateOrError = Generator.generateCodeTemplates(Instr); EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration. - return std::move(CodeTemplateOrError.get()); + auto &CodeTemplate = CodeTemplateOrError.get(); + EXPECT_EQ(CodeTemplate.size(), 1U); + return std::move(CodeTemplate.front()); } SnippetGeneratorT Generator; @@ -240,7 +242,7 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) { // MOVSB writes to scratch memory register. const unsigned Opcode = llvm::X86::MOVSB; const Instruction Instr(State, Opcode); - auto Error = Generator.generateCodeTemplate(Instr).takeError(); + auto Error = Generator.generateCodeTemplates(Instr).takeError(); EXPECT_TRUE((bool)Error); llvm::consumeError(std::move(Error)); } @@ -254,8 +256,8 @@ public: } private: - llvm::Expected - generateCodeTemplate(const Instruction &Instr) const override { + llvm::Expected> + generateCodeTemplates(const Instruction &Instr) const override { return llvm::make_error("not implemented", llvm::inconvertibleErrorCode()); } -- GitLab From dc5c9c28094836e848bb845d24a3b1e933342aaa Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 09:17:09 +0000 Subject: [PATCH 0181/1116] [TI removal] Remove TerminatorInst as an input parameter from all public LLVM APIs. There weren't very many. We still have the instruction visitor, and APIs with TerminatorInst as a return type or an output parameter. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344494 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/CFG.h | 3 +-- include/llvm/Transforms/Utils/BasicBlockUtils.h | 2 +- lib/Analysis/CFG.cpp | 3 ++- lib/Transforms/Utils/BreakCriticalEdges.cpp | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/llvm/Analysis/CFG.h b/include/llvm/Analysis/CFG.h index cccdd163741..caae0b6e2a8 100644 --- a/include/llvm/Analysis/CFG.h +++ b/include/llvm/Analysis/CFG.h @@ -25,7 +25,6 @@ class DominatorTree; class Function; class Instruction; class LoopInfo; -class TerminatorInst; /// Analyze the specified function to find all of the loop backedges in the /// function and return them. This is a relatively cheap (compared to @@ -46,7 +45,7 @@ unsigned GetSuccessorNumber(const BasicBlock *BB, const BasicBlock *Succ); /// edges from a block with multiple successors to a block with multiple /// predecessors. /// -bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, +bool isCriticalEdge(const Instruction *TI, unsigned SuccNum, bool AllowIdenticalEdges = false); /// Determine whether instruction 'To' is reachable from 'From', diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h index dee1541f9d2..f0ba5c6c9c2 100644 --- a/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -128,7 +128,7 @@ struct CriticalEdgeSplittingOptions { /// IndirectBrInst. Splitting these edges will almost always create an invalid /// program because the address of the new block won't be the one that is jumped /// to. 
-BasicBlock *SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, +BasicBlock *SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options = CriticalEdgeSplittingOptions()); diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp index a319be8092f..aaea5995429 100644 --- a/lib/Analysis/CFG.cpp +++ b/lib/Analysis/CFG.cpp @@ -85,8 +85,9 @@ unsigned llvm::GetSuccessorNumber(const BasicBlock *BB, /// isCriticalEdge - Return true if the specified edge is a critical edge. /// Critical edges are edges from a block with multiple successors to a block /// with multiple predecessors. -bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, +bool llvm::isCriticalEdge(const Instruction *TI, unsigned SuccNum, bool AllowIdenticalEdges) { + assert(TI->isTerminator() && "Must be a terminator to have successors!"); assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); if (TI->getNumSuccessors() == 1) return false; diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 63b37e37943..c3d67087ae7 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -130,7 +130,7 @@ static void createPHIsForSplitLoopExit(ArrayRef Preds, } BasicBlock * -llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, +llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options) { if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges)) return nullptr; -- GitLab From 7c0f083bcb9e916d3b65a7894db7dd602e2d8536 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 09:17:38 +0000 Subject: [PATCH 0182/1116] [TI removal] Remove a unnecessary use of `TerminatorInst` from an IR header. NFC. Part of the removal of `TerminatorInst` from the type hierarchy. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344495 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/CFG.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h index fd384ef4949..4140c8a212e 100644 --- a/include/llvm/IR/CFG.h +++ b/include/llvm/IR/CFG.h @@ -73,7 +73,7 @@ public: inline reference operator*() const { assert(!It.atEnd() && "pred_iterator out of range!"); - return cast(*It)->getParent(); + return cast(*It)->getParent(); } inline pointer *operator->() const { return &operator*(); } -- GitLab From 9d078e56967c22e40709bd25deab73b981ee7f09 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 15 Oct 2018 09:21:21 +0000 Subject: [PATCH 0183/1116] [llvm-exegesis] Fix missing std::move. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344496 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-exegesis/lib/SnippetGenerator.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp index 9b577fd65a9..feee61d113c 100644 --- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp +++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp @@ -125,7 +125,7 @@ generateSelfAliasingCodeTemplates(const Instruction &Instr) { setRandomAliasing(SelfAliasing, IT, IT); } CT.Instructions.push_back(std::move(IT)); - return Result; + return std::move(Result); } llvm::Expected> @@ -136,7 +136,7 @@ generateUnconstrainedCodeTemplates(const Instruction &Instr, CodeTemplate &CT = Result.back(); CT.Info = llvm::formatv("{0}, repeating an unconstrained assignment", Msg); CT.Instructions.emplace_back(Instr); - return Result; + return std::move(Result); } std::mt19937 &randomGenerator() { -- GitLab From f2c212eed34f9b1f4fa601de8c02f31f6bfe3f48 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 09:33:40 +0000 Subject: [PATCH 0184/1116] [TI removal] Just use Instruction in the CFG 
printer code. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344497 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/CFGPrinter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h index a4b642b9ea3..5996dd90bcf 100644 --- a/include/llvm/Analysis/CFGPrinter.h +++ b/include/llvm/Analysis/CFGPrinter.h @@ -150,7 +150,7 @@ struct DOTGraphTraits : public DefaultDOTGraphTraits { /// Display the raw branch weights from PGO. std::string getEdgeAttributes(const BasicBlock *Node, succ_const_iterator I, const Function *F) { - const TerminatorInst *TI = Node->getTerminator(); + const Instruction *TI = Node->getTerminator(); if (TI->getNumSuccessors() == 1) return ""; -- GitLab From ce1e09bcf53d60db89215c1800c1a6e2562d5b0b Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 09:34:05 +0000 Subject: [PATCH 0185/1116] [TI removal] Remove `TerminatorInst` from BasicBlockUtils.h This requires updating a number of .cpp files to adapt to the new API. I've just systematically updated all uses of `TerminatorInst` within these files to `Instruction` so that I won't have to touch them again in the future. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344498 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Transforms/Utils/BasicBlockUtils.h | 18 ++++++------- .../Instrumentation/AddressSanitizer.cpp | 13 +++++----- .../Instrumentation/EfficiencySanitizer.cpp | 2 +- .../Instrumentation/HWAddressSanitizer.cpp | 2 +- lib/Transforms/Scalar/JumpThreading.cpp | 16 ++++++------ lib/Transforms/Utils/BasicBlockUtils.cpp | 25 ++++++++++--------- lib/Transforms/Utils/CallPromotionUtils.cpp | 4 +-- lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 2 +- lib/Transforms/Utils/LowerMemIntrinsics.cpp | 2 +- 9 files changed, 43 insertions(+), 41 deletions(-) diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h index f0ba5c6c9c2..a0fc18825a5 100644 --- a/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -148,7 +148,7 @@ inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI, const CriticalEdgeSplittingOptions &Options = CriticalEdgeSplittingOptions()) { bool MadeChange = false; - TerminatorInst *TI = (*PI)->getTerminator(); + Instruction *TI = (*PI)->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (TI->getSuccessor(i) == Succ) MadeChange |= !!SplitCriticalEdge(TI, i, Options); @@ -162,7 +162,7 @@ inline BasicBlock * SplitCriticalEdge(BasicBlock *Src, BasicBlock *Dst, const CriticalEdgeSplittingOptions &Options = CriticalEdgeSplittingOptions()) { - TerminatorInst *TI = Src->getTerminator(); + Instruction *TI = Src->getTerminator(); unsigned i = 0; while (true) { assert(i != TI->getNumSuccessors() && "Edge doesn't exist!"); @@ -257,11 +257,11 @@ ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, /// Returns the NewBasicBlock's terminator. /// /// Updates DT and LI if given. 
-TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, - bool Unreachable, - MDNode *BranchWeights = nullptr, - DominatorTree *DT = nullptr, - LoopInfo *LI = nullptr); +Instruction *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, + bool Unreachable, + MDNode *BranchWeights = nullptr, + DominatorTree *DT = nullptr, + LoopInfo *LI = nullptr); /// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, /// but also creates the ElseBlock. @@ -278,8 +278,8 @@ TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, /// SplitBefore /// Tail void SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, - TerminatorInst **ThenTerm, - TerminatorInst **ElseTerm, + Instruction **ThenTerm, + Instruction **ElseTerm, MDNode *BranchWeights = nullptr); /// Check whether BB is the merge point of a if-region. diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b832417154e..ad07b608934 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1443,7 +1443,7 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass, } else { IRBuilder<> IRB(I); Value *MaskElem = IRB.CreateExtractElement(Mask, Idx); - TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); + Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false); InsertBefore = ThenTerm; } @@ -1596,8 +1596,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *TagCheck = IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag)); - TerminatorInst *TagCheckTerm = SplitBlockAndInsertIfThen( - TagCheck, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000)); + Instruction *TagCheckTerm = + SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false, + MDBuilder(*C).createBranchWeights(1, 100000)); 
assert(cast(TagCheckTerm)->isUnconditional()); IRB.SetInsertPoint(TagCheckTerm); InsertBefore = TagCheckTerm; @@ -1613,12 +1614,12 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); size_t Granularity = 1ULL << Mapping.Scale; - TerminatorInst *CrashTerm = nullptr; + Instruction *CrashTerm = nullptr; if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) { // We use branch weights for the slow path check, to indicate that the slow // path is rarely taken. This seems to be the case for SPEC benchmarks. - TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen( + Instruction *CheckTerm = SplitBlockAndInsertIfThen( Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000)); assert(cast(CheckTerm)->isUnconditional()); BasicBlock *NextBB = CheckTerm->getSuccessor(0); @@ -3116,7 +3117,7 @@ void FunctionStackPoisoner::processStaticAllocas() { // Value *Cmp = IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy)); - TerminatorInst *ThenTerm, *ElseTerm; + Instruction *ThenTerm, *ElseTerm; SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm); IRBuilder<> IRBPoison(ThenTerm); diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp index 0ab915de60d..db438e78ded 100644 --- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp +++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp @@ -887,7 +887,7 @@ bool EfficiencySanitizer::instrumentFastpathWorkingSet( Value *OldValue = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); // The AND and CMP will be turned into a TEST instruction by the compiler. Value *Cmp = IRB.CreateICmpNE(IRB.CreateAnd(OldValue, ValueMask), ValueMask); - TerminatorInst *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false); + Instruction *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false); // FIXME: do I need to call SetCurrentDebugLocation? 
IRB.SetInsertPoint(CmpTerm); // We use OR to set the shadow bits to avoid corrupting the middle 6 bits, diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 510b1b058d0..63bd8ee35c6 100644 --- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -467,7 +467,7 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite, TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored); } - TerminatorInst *CheckTerm = + Instruction *CheckTerm = SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover, MDBuilder(*C).createBranchWeights(1, 100000)); diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 1fc8f3988f9..849ff71e198 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -947,7 +947,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( /// Since we can pick an arbitrary destination, we pick the successor with the /// fewest predecessors. This should reduce the in-degree of the others. static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { - TerminatorInst *BBTerm = BB->getTerminator(); + Instruction *BBTerm = BB->getTerminator(); unsigned MinSucc = 0; BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc); // Compute the successor with the minimum number of predecessors. @@ -988,7 +988,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const TerminatorInst *TI = SinglePred->getTerminator(); + const Instruction *TI = SinglePred->getTerminator(); if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. 
@@ -1080,7 +1080,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { std::vector Updates; // Fold the branch/switch. - TerminatorInst *BBTerm = BB->getTerminator(); + Instruction *BBTerm = BB->getTerminator(); Updates.reserve(BBTerm->getNumSuccessors()); for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) { if (i == BestSucc) continue; @@ -1549,7 +1549,7 @@ FindMostPopularDest(BasicBlock *BB, // successor list. if (!SamePopularity.empty()) { SamePopularity.push_back(MostPopularDest); - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); for (unsigned i = 0; ; ++i) { assert(i != TI->getNumSuccessors() && "Didn't find any successor!"); @@ -1669,7 +1669,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, } // Finally update the terminator. - TerminatorInst *Term = BB->getTerminator(); + Instruction *Term = BB->getTerminator(); BranchInst::Create(OnlyDest, Term); Term->eraseFromParent(); DTU->applyUpdates(Updates); @@ -2006,7 +2006,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // Update the terminator of PredBB to jump to NewBB instead of BB. This // eliminates predecessors from BB, which requires us to simplify any PHI // nodes in BB. - TerminatorInst *PredTerm = PredBB->getTerminator(); + Instruction *PredTerm = PredBB->getTerminator(); for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) if (PredTerm->getSuccessor(i) == BB) { BB->removePredecessor(PredBB, true); @@ -2115,7 +2115,7 @@ BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB, } bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); assert(TI->getNumSuccessors() > 1 && "not a split"); MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof); @@ -2538,7 +2538,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { if (!SI) continue; // Expand the select. 
- TerminatorInst *Term = + Instruction *Term = SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); BasicBlock *SplitBB = SI->getParent(); BasicBlock *NewBB = Term->getParent(); diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 99914fcf81b..11a0114150f 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -52,7 +52,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU) { assert((pred_begin(BB) == pred_end(BB) || // Can delete self loop. BB->getSinglePredecessor() == BB) && "Block is not dead!"); - TerminatorInst *BBTerm = BB->getTerminator(); + Instruction *BBTerm = BB->getTerminator(); std::vector Updates; // Loop through all of our successors and make sure they know that one @@ -270,7 +270,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, unsigned SuccNum = GetSuccessorNumber(BB, Succ); // If this is a critical edge, let SplitCriticalEdge do it. 
- TerminatorInst *LatchTerm = BB->getTerminator(); + Instruction *LatchTerm = BB->getTerminator(); if (SplitCriticalEdge( LatchTerm, SuccNum, CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA())) @@ -298,7 +298,7 @@ llvm::SplitAllCriticalEdges(Function &F, const CriticalEdgeSplittingOptions &Options) { unsigned NumBroken = 0; for (BasicBlock &BB : F) { - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (TI->getNumSuccessors() > 1 && !isa(TI)) for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (SplitCriticalEdge(TI, i, Options)) @@ -705,16 +705,17 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast(NewRet); } -TerminatorInst * -llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, - bool Unreachable, MDNode *BranchWeights, - DominatorTree *DT, LoopInfo *LI) { +Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond, + Instruction *SplitBefore, + bool Unreachable, + MDNode *BranchWeights, + DominatorTree *DT, LoopInfo *LI) { BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); - TerminatorInst *HeadOldTerm = Head->getTerminator(); + Instruction *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); - TerminatorInst *CheckTerm; + Instruction *CheckTerm; if (Unreachable) CheckTerm = new UnreachableInst(C, ThenBlock); else @@ -749,12 +750,12 @@ llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, } void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore, - TerminatorInst **ThenTerm, - TerminatorInst **ElseTerm, + Instruction **ThenTerm, + Instruction **ElseTerm, MDNode *BranchWeights) { BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator()); - TerminatorInst *HeadOldTerm = Head->getTerminator(); + 
Instruction *HeadOldTerm = Head->getTerminator(); LLVMContext &C = Head->getContext(); BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index 261ab87c3e7..4db579156d9 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -270,8 +270,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee, // Create an if-then-else structure. The original instruction is moved into // the "else" block, and a clone of the original instruction is placed in the // "then" block. - TerminatorInst *ThenTerm = nullptr; - TerminatorInst *ElseTerm = nullptr; + Instruction *ThenTerm = nullptr; + Instruction *ElseTerm = nullptr; SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm, BranchWeights); BasicBlock *ThenBlock = ThenTerm->getParent(); diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 9832a6f24e1..e1592c86763 100644 --- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -487,7 +487,7 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) { MDNode *BranchWeights = MDBuilder(CI->getContext()).createBranchWeights(1, 2000); - TerminatorInst *NewInst = + Instruction *NewInst = SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT); BasicBlock *CallBB = NewInst->getParent(); CallBB->setName("cdce.call"); diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 03006ef3a2d..661b4fa5bcb 100644 --- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -301,7 +301,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, // the appropriate conditional branches when the loop is 
built. ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT, SrcAddr, DstAddr, "compare_src_dst"); - TerminatorInst *ThenTerm, *ElseTerm; + Instruction *ThenTerm, *ElseTerm; SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm, &ElseTerm); -- GitLab From ac346921b5695fdac3883c83262295399b7df0ca Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 09:34:31 +0000 Subject: [PATCH 0186/1116] [TI removal] Remove a dead forward declaration of TerminatorInst. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344499 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Scalar.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index 9491e1bbac9..fe4ff621c6f 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -26,7 +26,6 @@ class ModulePass; class Pass; class GetElementPtrInst; class PassInfo; -class TerminatorInst; class TargetLowering; class TargetMachine; -- GitLab From fc6649b88c16738a266813e6ce32e022d16a5439 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 09:47:26 +0000 Subject: [PATCH 0187/1116] [TI removal] Remove `TerminatorInst` from SparsePropagation.h and related code. This is simple as we just need to replace the type and move to the concept of visiting a "terminator" rather than a specific instruction subclass. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344500 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/SparsePropagation.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h index 04e94f7cd52..02a2e64268b 100644 --- a/include/llvm/Analysis/SparsePropagation.h +++ b/include/llvm/Analysis/SparsePropagation.h @@ -189,12 +189,12 @@ private: /// getFeasibleSuccessors - Return a vector of booleans to indicate which /// successors are reachable from a given terminator instruction. - void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl &Succs, + void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl &Succs, bool AggressiveUndef); void visitInst(Instruction &I); void visitPHINode(PHINode &I); - void visitTerminatorInst(TerminatorInst &TI); + void visitTerminator(Instruction &TI); }; //===----------------------------------------------------------------------===// @@ -286,7 +286,7 @@ void SparseSolver::markEdgeExecutable( template void SparseSolver::getFeasibleSuccessors( - TerminatorInst &TI, SmallVectorImpl &Succs, bool AggressiveUndef) { + Instruction &TI, SmallVectorImpl &Succs, bool AggressiveUndef) { Succs.resize(TI.getNumSuccessors()); if (TI.getNumSuccessors() == 0) return; @@ -374,7 +374,7 @@ template bool SparseSolver::isEdgeFeasible( BasicBlock *From, BasicBlock *To, bool AggressiveUndef) { SmallVector SuccFeasible; - TerminatorInst *TI = From->getTerminator(); + Instruction *TI = From->getTerminator(); getFeasibleSuccessors(*TI, SuccFeasible, AggressiveUndef); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) @@ -385,8 +385,8 @@ bool SparseSolver::isEdgeFeasible( } template -void SparseSolver::visitTerminatorInst( - TerminatorInst &TI) { +void SparseSolver::visitTerminator( + Instruction &TI) { SmallVector SuccFeasible; getFeasibleSuccessors(TI, SuccFeasible, true); @@ -465,8 +465,8 @@ void 
SparseSolver::visitInst(Instruction &I) { if (ChangedValue.second != LatticeFunc->getUntrackedVal()) UpdateState(ChangedValue.first, ChangedValue.second); - if (TerminatorInst *TI = dyn_cast(&I)) - visitTerminatorInst(*TI); + if (I.isTerminator()) + visitTerminator(I); } template -- GitLab From aa517f562fbd28786bcdf69f9e8f365e791fad80 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 10:00:15 +0000 Subject: [PATCH 0188/1116] [TI removal] Remove `TerminatorInst` from GVN.h and GVN.cpp. This is the last interesting usage in all of LLVM's headers. The remaining usages in headers are the core typesystem bits (Core.h, instruction types, and InstVisitor) and as the return of `BasicBlock::getTerminator`. The latter is the big remaining API point that I'll remove after mass updates to user code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344501 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Scalar/GVN.h | 2 +- lib/Transforms/Scalar/GVN.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h index c01a1d77e96..784de7f9fe2 100644 --- a/include/llvm/Transforms/Scalar/GVN.h +++ b/include/llvm/Transforms/Scalar/GVN.h @@ -237,7 +237,7 @@ private: } // List of critical edges to be split between iterations. 
- SmallVector, 4> toSplit; + SmallVector, 4> toSplit; // Helper functions of redundant load elimination bool processLoad(LoadInst *L); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index d6c2824a299..c080c2a1813 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -2341,7 +2341,7 @@ bool GVN::splitCriticalEdges() { if (toSplit.empty()) return false; do { - std::pair Edge = toSplit.pop_back_val(); + std::pair Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, CriticalEdgeSplittingOptions(DT)); } while (!toSplit.empty()); -- GitLab From 2aaf7228e0e39c5afc21788b832430f0154f185b Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 10:04:59 +0000 Subject: [PATCH 0189/1116] [TI removal] Make variables declared as `TerminatorInst` and initialized by `getTerminator()` calls instead be declared as `Instruction`. This is the biggest remaining chunk of the usage of `getTerminator()` that insists on the narrow type and so is an easy batch of updates. Several files saw more extensive updates where this would cascade to requiring API updates within the file to use `Instruction` instead of `TerminatorInst`. All of these were trivial in nature (pervasively using `Instruction` instead just worked). 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344502 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/BranchProbabilityInfo.cpp | 10 +-- lib/Analysis/CFG.cpp | 2 +- lib/Analysis/EHPersonalities.cpp | 2 +- lib/Analysis/InlineCost.cpp | 2 +- lib/Analysis/LoopInfo.cpp | 4 +- lib/Analysis/MemorySSAUpdater.cpp | 2 +- lib/Analysis/ScalarEvolution.cpp | 2 +- lib/CodeGen/Analysis.cpp | 2 +- lib/CodeGen/SelectionDAG/FastISel.cpp | 2 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 2 +- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +- lib/CodeGen/WinEHPrepare.cpp | 4 +- lib/IR/BasicBlock.cpp | 4 +- lib/IR/Dominators.cpp | 2 +- lib/IR/SafepointIRVerifier.cpp | 2 +- lib/IR/Verifier.cpp | 2 +- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 2 +- lib/Target/NVPTX/NVPTXAllocaHoisting.cpp | 2 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 2 +- lib/Target/X86/X86WinEHState.cpp | 2 +- lib/Transforms/IPO/HotColdSplitting.cpp | 2 +- lib/Transforms/IPO/LoopExtractor.cpp | 4 +- lib/Transforms/IPO/PartialInlining.cpp | 2 +- lib/Transforms/IPO/PruneEH.cpp | 2 +- lib/Transforms/IPO/SampleProfile.cpp | 2 +- .../InstCombine/InstCombineCalls.cpp | 2 +- lib/Transforms/InstCombine/InstCombinePHI.cpp | 4 +- .../InstCombine/InstructionCombining.cpp | 4 +- lib/Transforms/Instrumentation/CFGMST.h | 2 +- .../Instrumentation/GCOVProfiling.cpp | 6 +- .../Instrumentation/PGOInstrumentation.cpp | 8 +- lib/Transforms/Scalar/ADCE.cpp | 8 +- lib/Transforms/Scalar/CallSiteSplitting.cpp | 2 +- .../Scalar/DeadStoreElimination.cpp | 2 +- lib/Transforms/Scalar/LoopUnrollPass.cpp | 2 +- lib/Transforms/Scalar/LoopUnswitch.cpp | 21 +++-- lib/Transforms/Scalar/PlaceSafepoints.cpp | 6 +- .../Scalar/RewriteStatepointsForGC.cpp | 12 +-- lib/Transforms/Scalar/SCCP.cpp | 2 +- lib/Transforms/Scalar/SROA.cpp | 4 +- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 2 +- lib/Transforms/Scalar/StructurizeCFG.cpp | 2 +- .../Scalar/TailRecursionElimination.cpp | 2 +- 
lib/Transforms/Utils/BreakCriticalEdges.cpp | 2 +- lib/Transforms/Utils/CloneFunction.cpp | 4 +- lib/Transforms/Utils/CodeExtractor.cpp | 8 +- lib/Transforms/Utils/EscapeEnumerator.cpp | 2 +- lib/Transforms/Utils/FlattenCFG.cpp | 6 +- lib/Transforms/Utils/FunctionComparator.cpp | 6 +- lib/Transforms/Utils/InlineFunction.cpp | 2 +- lib/Transforms/Utils/Local.cpp | 6 +- lib/Transforms/Utils/LoopRotationUtils.cpp | 2 +- lib/Transforms/Utils/LoopSimplify.cpp | 4 +- lib/Transforms/Utils/LoopUnroll.cpp | 2 +- lib/Transforms/Utils/LoopUnrollAndJam.cpp | 2 +- lib/Transforms/Utils/SimplifyCFG.cpp | 80 +++++++++---------- lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp | 2 +- 58 files changed, 143 insertions(+), 144 deletions(-) diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index 54a657073f0..7f544b27fe9 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -135,7 +135,7 @@ static const uint32_t IH_NONTAKEN_WEIGHT = 1; /// Add \p BB to PostDominatedByUnreachable set if applicable. void BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); if (TI->getNumSuccessors() == 0) { if (isa(TI) || // If this block is terminated by a call to @@ -167,7 +167,7 @@ BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) { void BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) { assert(!PostDominatedByColdCall.count(BB)); - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); if (TI->getNumSuccessors() == 0) return; @@ -202,7 +202,7 @@ BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) { /// Predict that a successor which leads necessarily to an /// unreachable-terminated block as extremely unlikely. 
bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); (void) TI; assert(TI->getNumSuccessors() > 1 && "expected more than one successor!"); assert(!isa(TI) && @@ -246,7 +246,7 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) { // heuristic. The probability of the edge coming to unreachable block is // set to min of metadata and unreachable heuristic. bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); assert(TI->getNumSuccessors() > 1 && "expected more than one successor!"); if (!(isa(TI) || isa(TI) || isa(TI))) return false; @@ -348,7 +348,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { /// Return true if we could compute the weights for cold edges. /// Return false, otherwise. bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); (void) TI; assert(TI->getNumSuccessors() > 1 && "expected more than one successor!"); assert(!isa(TI) && diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp index aaea5995429..aa880a62b75 100644 --- a/lib/Analysis/CFG.cpp +++ b/lib/Analysis/CFG.cpp @@ -71,7 +71,7 @@ void llvm::FindFunctionBackedges(const Function &F, /// successor. 
unsigned llvm::GetSuccessorNumber(const BasicBlock *BB, const BasicBlock *Succ) { - const TerminatorInst *Term = BB->getTerminator(); + const Instruction *Term = BB->getTerminator(); #ifndef NDEBUG unsigned e = Term->getNumSuccessors(); #endif diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp index 2d35a3fa911..0df73aeebbd 100644 --- a/lib/Analysis/EHPersonalities.cpp +++ b/lib/Analysis/EHPersonalities.cpp @@ -120,7 +120,7 @@ DenseMap llvm::colorEHFunclets(Function &F) { << "\'.\n"); BasicBlock *SuccColor = Color; - TerminatorInst *Terminator = Visiting->getTerminator(); + Instruction *Terminator = Visiting->getTerminator(); if (auto *CatchRet = dyn_cast(Terminator)) { Value *ParentPad = CatchRet->getCatchSwitchParentPad(); if (isa(ParentPad)) diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index fb032e0404c..1b5150a0d18 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -1831,7 +1831,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) { if (!IR) return IR; - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); // Add in the live successors by first checking whether we have terminator // that may be simplified based on the values simplified by this call. 
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 99ff25a3fd3..4b174b66d1e 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -219,7 +219,7 @@ MDNode *Loop::getLoopID() const { SmallVector LatchesBlocks; getLoopLatches(LatchesBlocks); for (BasicBlock *BB : LatchesBlocks) { - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); MDNode *MD = TI->getMetadata(LLVMContext::MD_loop); if (!MD) @@ -250,7 +250,7 @@ void Loop::setLoopID(MDNode *LoopID) const { "The loop should have no single latch at this point"); BasicBlock *H = getHeader(); for (BasicBlock *BB : this->blocks()) { - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); for (BasicBlock *Successor : successors(TI)) { if (Successor == H) TI->setMetadata(LLVMContext::MD_loop, LoopID); diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp index 51a5733a3ef..880dc2f2785 100644 --- a/lib/Analysis/MemorySSAUpdater.cpp +++ b/lib/Analysis/MemorySSAUpdater.cpp @@ -1104,7 +1104,7 @@ void MemorySSAUpdater::removeBlocks( const SmallPtrSetImpl &DeadBlocks) { // First delete all uses of BB in MemoryPhis. 
for (BasicBlock *BB : DeadBlocks) { - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); assert(TI && "Basic block expected to have a terminator instruction"); for (BasicBlock *Succ : successors(TI)) if (!DeadBlocks.count(Succ)) diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 193020ed92f..4a30447f647 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -7078,7 +7078,7 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock, return getCouldNotCompute(); bool IsOnlyExit = (L->getExitingBlock() != nullptr); - TerminatorInst *Term = ExitingBlock->getTerminator(); + Instruction *Term = ExitingBlock->getTerminator(); if (BranchInst *BI = dyn_cast(Term)) { assert(BI->isConditional() && "If unconditional, it can't be in loop!"); bool ExitIfTrue = !L->contains(BI->getSuccessor(0)); diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp index b769e92590f..aae04a573af 100644 --- a/lib/CodeGen/Analysis.cpp +++ b/lib/CodeGen/Analysis.cpp @@ -471,7 +471,7 @@ static bool nextRealType(SmallVectorImpl &SubTypes, bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) { const Instruction *I = CS.getInstruction(); const BasicBlock *ExitBB = I->getParent(); - const TerminatorInst *Term = ExitBB->getTerminator(); + const Instruction *Term = ExitBB->getTerminator(); const ReturnInst *Ret = dyn_cast(Term); // The block must end in a return statement or unreachable. diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index ad416017470..542cc10371e 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -2223,7 +2223,7 @@ unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) { /// might result in multiple MBB's for one BB. As such, the start of the /// BB might correspond to a different MBB than the end. 
bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { - const TerminatorInst *TI = LLVMBB->getTerminator(); + const Instruction *TI = LLVMBB->getTerminator(); SmallPtrSet SuccsHandled; FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index f7866665bcb..1a99ef734f1 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9249,7 +9249,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { /// the end. void SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) { - const TerminatorInst *TI = LLVMBB->getTerminator(); + const Instruction *TI = LLVMBB->getTerminator(); SmallPtrSet SuccsHandled; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index feb57eeafe7..2b4a590f19f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -451,7 +451,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { if (!succ_empty(&BB)) continue; - const TerminatorInst *Term = BB.getTerminator(); + const Instruction *Term = BB.getTerminator(); if (isa(Term) || isa(Term)) continue; diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index a3243235854..6a15240fa6e 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -218,7 +218,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn, // to. If the unwind edge came from an invoke, return null. 
static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB, Value *ParentPad) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); if (isa(TI)) return nullptr; if (auto *CatchSwitch = dyn_cast(TI)) { @@ -977,7 +977,7 @@ void WinEHPrepare::removeImplausibleInstructions(Function &F) { break; } - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst. bool IsUnreachableRet = isa(TI) && FuncletPad; // The token consumed by a CatchReturnInst must match the funclet token. diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index d04af9261e3..03fb5ccaffc 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -437,7 +437,7 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) { } void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) { - TerminatorInst *TI = getTerminator(); + Instruction *TI = getTerminator(); if (!TI) // Cope with being called on a BasicBlock that doesn't have a terminator // yet. Clang's CodeGenFunction::EmitReturnBlock() likes to do this. 
@@ -468,7 +468,7 @@ const LandingPadInst *BasicBlock::getLandingPadInst() const { } Optional BasicBlock::getIrrLoopHeaderWeight() const { - const TerminatorInst *TI = getTerminator(); + const Instruction *TI = getTerminator(); if (MDNode *MDIrrLoopHeader = TI->getMetadata(LLVMContext::MD_irr_loop)) { MDString *MDName = cast(MDIrrLoopHeader->getOperand(0)); diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp index c78f220439a..cf9f5759ba5 100644 --- a/lib/IR/Dominators.cpp +++ b/lib/IR/Dominators.cpp @@ -41,7 +41,7 @@ static constexpr bool ExpensiveChecksEnabled = false; #endif bool BasicBlockEdge::isSingleEdge() const { - const TerminatorInst *TI = Start->getTerminator(); + const Instruction *TI = Start->getTerminator(); unsigned NumEdgesToEnd = 0; for (unsigned int i = 0, n = TI->getNumSuccessors(); i < n; ++i) { if (TI->getSuccessor(i) == End) diff --git a/lib/IR/SafepointIRVerifier.cpp b/lib/IR/SafepointIRVerifier.cpp index 7af48f5301f..d2102138d79 100644 --- a/lib/IR/SafepointIRVerifier.cpp +++ b/lib/IR/SafepointIRVerifier.cpp @@ -134,7 +134,7 @@ public: // Top-down walk of the dominator tree ReversePostOrderTraversal RPOT(&F); for (const BasicBlock *BB : RPOT) { - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); assert(TI && "blocks must be well formed"); // For conditional branches, we can perform simple conditional propagation on diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 8304ec6e8f4..d96555ca5f9 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -3450,7 +3450,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { Instruction *ToPad = &I; Value *ToPadParent = getParentPad(ToPad); for (BasicBlock *PredBB : predecessors(BB)) { - TerminatorInst *TI = PredBB->getTerminator(); + Instruction *TI = PredBB->getTerminator(); Value *FromPad; if (auto *II = dyn_cast(TI)) { Assert(II->getUnwindDest() == BB && II->getNormalDest() != BB, diff --git 
a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index c52313f84ef..8248dbe1b0f 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -288,7 +288,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition( } } - TerminatorInst *Insert = From->getTerminator(); + Instruction *Insert = From->getTerminator(); Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); NewPhi->setIncomingValue(i, PhiArg); } diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp index bed52293197..bf922eb8a19 100644 --- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp +++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp @@ -41,7 +41,7 @@ public: bool NVPTXAllocaHoisting::runOnFunction(Function &function) { bool functionModified = false; Function::iterator I = function.begin(); - TerminatorInst *firstTerminatorInst = (I++)->getTerminator(); + Instruction *firstTerminatorInst = (I++)->getTerminator(); for (Function::iterator E = function.end(); I != E; ++I) { for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) { diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 5ec7b102884..8861de6f0d8 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -490,7 +490,7 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, if (!FuncInfo->BPI) return PPC::BR_NO_HINT; const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); - const TerminatorInst *BBTerm = BB->getTerminator(); + const Instruction *BBTerm = BB->getTerminator(); if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index b5a88129c6b..f0d24075801 100644 --- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ 
b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1030,7 +1030,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) { // Free setjmpTable buffer before each return instruction for (BasicBlock &BB : F) { - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (isa(TI)) CallInst::CreateFree(SetjmpTable, TI); } diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index dde9c734f49..c11e7e365a1 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -369,7 +369,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { // Insert an unlink before all returns. for (BasicBlock &BB : *F) { - TerminatorInst *T = BB.getTerminator(); + Instruction *T = BB.getTerminator(); if (!isa(T)) continue; Builder.SetInsertPoint(T); diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp index 810fdf418a2..9d2634f1bc9 100644 --- a/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/lib/Transforms/IPO/HotColdSplitting.cpp @@ -104,7 +104,7 @@ static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit, bool blockEndsInUnreachable(const BasicBlock &BB) { if (BB.empty()) return true; - const TerminatorInst *I = BB.getTerminator(); + const Instruction *I = BB.getTerminator(); if (isa(I) || isa(I)) return true; // Unreachable blocks do not have any successor. diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 8c86f7cb806..733235d45a0 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -104,8 +104,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { bool ShouldExtractLoop = false; // Extract the loop if the entry block doesn't branch to the loop header. 
- TerminatorInst *EntryTI = - L->getHeader()->getParent()->getEntryBlock().getTerminator(); + Instruction *EntryTI = + L->getHeader()->getParent()->getEntryBlock().getTerminator(); if (!isa(EntryTI) || !cast(EntryTI)->isUnconditional() || EntryTI->getSuccessor(0) != L->getHeader()) { diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 709222dbec0..11c4bbc437c 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -556,7 +556,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { }; auto IsReturnBlock = [](BasicBlock *BB) { - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); return isa(TI); }; diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp index 2caee294221..ae586c01747 100644 --- a/lib/Transforms/IPO/PruneEH.cpp +++ b/lib/Transforms/IPO/PruneEH.cpp @@ -107,7 +107,7 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) { continue; for (const BasicBlock &BB : *F) { - const TerminatorInst *TI = BB.getTerminator(); + const Instruction *TI = BB.getTerminator(); if (CheckUnwind && TI->mayThrow()) { SCCMightUnwind = true; } else if (CheckReturn && isa(TI)) { diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 4a69a0c2806..a78e0d459c8 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -1297,7 +1297,7 @@ void SampleProfileLoader::propagateWeights(Function &F) { } } } - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); if (TI->getNumSuccessors() == 1) continue; if (!isa(TI) && !isa(TI)) diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 714c6176884..6d2ac2274de 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3732,7 +3732,7 @@ Instruction 
*InstCombiner::visitCallInst(CallInst &CI) { // Scan down this block to see if there is another stack restore in the // same block without an intervening call/alloca. BasicBlock::iterator BI(II); - TerminatorInst *TI = II->getParent()->getTerminator(); + Instruction *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { if (isa(BI)) { diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index 0289abe472e..94745094c15 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -652,7 +652,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. - if (TerminatorInst *TI = Phi.getParent()->getTerminator()) + if (Instruction *TI = Phi.getParent()->getTerminator()) if (TI->isEHPad()) return nullptr; @@ -726,7 +726,7 @@ Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) { Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { // We cannot create a new instruction after the PHI if the terminator is an // EHPad because there is no valid insertion point. - if (TerminatorInst *TI = PN.getParent()->getTerminator()) + if (Instruction *TI = PN.getParent()->getTerminator()) if (TI->isEHPad()) return nullptr; diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 00ffe9e2dc2..ae7d08149c6 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2347,7 +2347,7 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) { return nullptr; // Validate the rest of constraint #1 by matching on the pred branch. 
- TerminatorInst *TI = PredBB->getTerminator(); + Instruction *TI = PredBB->getTerminator(); BasicBlock *TrueBB, *FalseBB; ICmpInst::Predicate Pred; if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB))) @@ -3285,7 +3285,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, // Recursively visit successors. If this is a branch or switch on a // constant, only visit the reachable successor. - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); if (BranchInst *BI = dyn_cast(TI)) { if (BI->isConditional() && isa(BI->getCondition())) { bool CondVal = cast(BI->getCondition())->getZExtValue(); diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h index cc9b149d0b6..e178ef386e6 100644 --- a/lib/Transforms/Instrumentation/CFGMST.h +++ b/lib/Transforms/Instrumentation/CFGMST.h @@ -119,7 +119,7 @@ public: static const uint32_t CriticalEdgeMultiplier = 1000; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); uint64_t BBWeight = (BFI != nullptr ? 
BFI->getBlockFreq(&*BB).getFrequency() : 2); uint64_t Weight = 2; diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index a060dd53513..ee546a9a828 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -578,7 +578,7 @@ void GCOVProfiler::emitProfileNotes() { for (auto &BB : F) { GCOVBlock &Block = Func.getBlock(&BB); - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (int successors = TI->getNumSuccessors()) { for (int i = 0; i != successors; ++i) { Block.addEdge(Func.getBlock(TI->getSuccessor(i))); @@ -646,7 +646,7 @@ bool GCOVProfiler::emitProfileArcs() { DenseMap, unsigned> EdgeToCounter; unsigned Edges = 0; for (auto &BB : F) { - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (isa(TI)) { EdgeToCounter[{&BB, nullptr}] = Edges++; } else { @@ -690,7 +690,7 @@ bool GCOVProfiler::emitProfileArcs() { Count = Builder.CreateAdd(Count, Builder.getInt64(1)); Builder.CreateStore(Count, Phi); - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (isa(TI)) { auto It = EdgeToCounter.find({&BB, nullptr}); assert(It != EdgeToCounter.end()); diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index ac851f660d9..4790c9e5cfe 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -586,7 +586,7 @@ void FuncPGOInstrumentation::computeCFGHash() { std::vector Indexes; JamCRC JC; for (auto &BB : F) { - const TerminatorInst *TI = BB.getTerminator(); + const Instruction *TI = BB.getTerminator(); for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) { BasicBlock *Succ = TI->getSuccessor(I); auto BI = findBBInfo(Succ); @@ -698,7 +698,7 @@ BasicBlock *FuncPGOInstrumentation::getInstrBB(Edge *E) { // Instrument 
the SrcBB if it has a single successor, // otherwise, the DestBB if this is not a critical edge. - TerminatorInst *TI = SrcBB->getTerminator(); + Instruction *TI = SrcBB->getTerminator(); if (TI->getNumSuccessors() <= 1) return SrcBB; if (!E->IsCritical) @@ -1167,7 +1167,7 @@ void PGOUseFunc::setBranchWeights() { // Generate MD_prof metadata for every branch instruction. LLVM_DEBUG(dbgs() << "\nSetting branch weights.\n"); for (auto &BB : F) { - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (TI->getNumSuccessors() < 2) continue; if (!(isa(TI) || isa(TI) || @@ -1213,7 +1213,7 @@ void PGOUseFunc::annotateIrrLoopHeaderWeights() { // to become an irreducible loop header after the indirectbr tail // duplication. if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) { - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); const UseBBInfo &BBCountInfo = getBBInfo(&BB); setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); } diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index 883d2e17350..b0602d96798 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -103,7 +103,7 @@ struct BlockInfoType { BasicBlock *BB = nullptr; /// Cache of BB->getTerminator(). - TerminatorInst *Terminator = nullptr; + Instruction *Terminator = nullptr; /// Post-order numbering of reverse control flow graph. unsigned PostOrder; @@ -206,7 +206,7 @@ bool AggressiveDeadCodeElimination::performDeadCodeElimination() { return removeDeadInstructions(); } -static bool isUnconditionalBranch(TerminatorInst *Term) { +static bool isUnconditionalBranch(Instruction *Term) { auto *BR = dyn_cast(Term); return BR && BR->isUnconditional(); } @@ -277,7 +277,7 @@ void AggressiveDeadCodeElimination::initialize() { // treat all edges to a block already seen as loop back edges // and mark the branch live it if there is a back edge. 
for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) { - TerminatorInst *Term = BB->getTerminator(); + Instruction *Term = BB->getTerminator(); if (isLive(Term)) continue; @@ -643,7 +643,7 @@ void AggressiveDeadCodeElimination::computeReversePostOrder() { void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB, BasicBlock *Target) { - TerminatorInst *PredTerm = BB->getTerminator(); + Instruction *PredTerm = BB->getTerminator(); // Collect the live debug info scopes attached to this instruction. if (const DILocation *DL = PredTerm->getDebugLoc()) collectLiveScopes(*DL); diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index 54385155cd2..e82682e08ab 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -248,7 +248,7 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI, ReturnInst* RI = dyn_cast(&*II); assert(RI && "`musttail` call must be followed by `ret` instruction"); - TerminatorInst *TI = SplitBB->getTerminator(); + Instruction *TI = SplitBB->getTerminator(); Value *V = NewCI; if (BCI) V = cloneInstForMustTail(BCI, TI, V); diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 545b0060c13..69112f3cee2 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -643,7 +643,7 @@ static void findUnconditionalPreds(SmallVectorImpl &Blocks, for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) { BasicBlock *Pred = *I; if (Pred == BB) continue; - TerminatorInst *PredTI = Pred->getTerminator(); + Instruction *PredTI = Pred->getTerminator(); if (PredTI->getNumSuccessors() != 1) continue; diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index d980cde49b6..34d2b2a8b27 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ 
b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -540,7 +540,7 @@ static Optional analyzeLoopUnrollCost( } } - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); // Add in the live successors by first checking whether we have terminator // that may be simplified based on the values simplified by this call. diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index f67bff7fe93..13e6bd13754 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -246,11 +246,11 @@ namespace { bool TryTrivialLoopUnswitch(bool &Changed); bool UnswitchIfProfitable(Value *LoopCond, Constant *Val, - TerminatorInst *TI = nullptr); + Instruction *TI = nullptr); void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, - BasicBlock *ExitBlock, TerminatorInst *TI); + BasicBlock *ExitBlock, Instruction *TI); void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L, - TerminatorInst *TI); + Instruction *TI); void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, Constant *Val, bool isEqual); @@ -258,8 +258,7 @@ namespace { void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, - BranchInst *OldBranch, - TerminatorInst *TI); + BranchInst *OldBranch, Instruction *TI); void SimplifyCode(std::vector &Worklist, Loop *L); @@ -713,7 +712,7 @@ bool LoopUnswitch::processCurrentLoop() { // loop. for (Loop::block_iterator I = currentLoop->block_begin(), E = currentLoop->block_end(); I != E; ++I) { - TerminatorInst *TI = (*I)->getTerminator(); + Instruction *TI = (*I)->getTerminator(); // Unswitching on a potentially uninitialized predicate is not // MSan-friendly. Limit this to the cases when the original predicate is @@ -876,7 +875,7 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) { /// simplify the loop. 
If we decide that this is profitable, /// unswitch the loop, reprocess the pieces, then return true. bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, - TerminatorInst *TI) { + Instruction *TI) { // Check to see if it would be profitable to unswitch current loop. if (!BranchesInfo.CostAllowsUnswitching()) { LLVM_DEBUG(dbgs() << "NOT unswitching loop %" @@ -931,7 +930,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest, BranchInst *OldBranch, - TerminatorInst *TI) { + Instruction *TI) { assert(OldBranch->isUnconditional() && "Preheader is not split correctly"); assert(TrueDest != FalseDest && "Branch targets should be different"); // Insert a conditional branch on LIC to the two preheaders. The original @@ -996,7 +995,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val, /// outside of the loop and updating loop info. void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, BasicBlock *ExitBlock, - TerminatorInst *TI) { + Instruction *TI) { LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %" << loopHeader->getName() << " [" << L->getBlocks().size() << " blocks] in Function " @@ -1054,7 +1053,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val, /// condition. bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { BasicBlock *CurrentBB = currentLoop->getHeader(); - TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); + Instruction *CurrentTerm = CurrentBB->getTerminator(); LLVMContext &Context = CurrentBB->getContext(); // If loop header has only one reachable successor (currently via an @@ -1227,7 +1226,7 @@ void LoopUnswitch::SplitExitEdges(Loop *L, /// Split it into loop versions and test the condition outside of either loop. /// Return the loops created as Out1/Out2. 
void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, - Loop *L, TerminatorInst *TI) { + Loop *L, Instruction *TI) { Function *F = loopHeader->getParent(); LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %" << loopHeader->getName() << " [" << L->getBlocks().size() diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp index 7f9aad24883..fd2eb85fd7b 100644 --- a/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -105,7 +105,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { /// The output of the pass - gives a list of each backedge (described by /// pointing at the branch) which need a poll inserted. - std::vector PollLocations; + std::vector PollLocations; /// True unless we're running spp-no-calls in which case we need to disable /// the call-dependent placement opts. @@ -348,7 +348,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) { // Safepoint insertion would involve creating a new basic block (as the // target of the current backedge) which does the safepoint (of all live // variables) and branches to the true header - TerminatorInst *Term = Pred->getTerminator(); + Instruction *Term = Pred->getTerminator(); LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term); @@ -535,7 +535,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { // Insert a poll at each point the analysis pass identified // The poll location must be the terminator of a loop latch block. 
- for (TerminatorInst *Term : PollLocations) { + for (Instruction *Term : PollLocations) { // We are inserting a poll, the function is modified Modified = true; diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 5e23a8a3dcd..cf2ce03049a 100644 --- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1851,13 +1851,13 @@ static void relocationViaAlloca( StoreInst *Store = new StoreInst(Def, Alloca); if (Instruction *Inst = dyn_cast(Def)) { if (InvokeInst *Invoke = dyn_cast(Inst)) { - // InvokeInst is a TerminatorInst so the store need to be inserted - // into its normal destination block. + // InvokeInst is a terminator so the store needs to be inserted into its + // normal destination block. BasicBlock *NormalDest = Invoke->getNormalDest(); Store->insertBefore(NormalDest->getFirstNonPHI()); } else { assert(!Inst->isTerminator() && - "The only TerminatorInst that can produce a value is " + "The only terminator that can produce a value is " "InvokeInst which is handled above."); Store->insertAfter(Inst); } @@ -2584,7 +2584,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // increase the liveset of any statepoint we move over. This is profitable // as long as all statepoints are in rare blocks. If we had in-register // lowering for live values this would be a much safer transform. - auto getConditionInst = [](TerminatorInst *TI) -> Instruction* { + auto getConditionInst = [](Instruction *TI) -> Instruction * { if (auto *BI = dyn_cast(TI)) if (BI->isConditional()) return dyn_cast(BI->getCondition()); @@ -2592,7 +2592,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, return nullptr; }; for (BasicBlock &BB : F) { - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (auto *Cond = getConditionInst(TI)) // TODO: Handle more than just ICmps here. 
We should be able to move // most instructions without side effects or memory access. @@ -2675,7 +2675,7 @@ static SetVector computeKillSet(BasicBlock *BB) { /// Check that the items in 'Live' dominate 'TI'. This is used as a basic /// sanity check for the liveness computation. static void checkBasicSSA(DominatorTree &DT, SetVector &Live, - TerminatorInst *TI, bool TermOkay = false) { + Instruction *TI, bool TermOkay = false) { for (Value *V : Live) { if (auto *I = dyn_cast(V)) { // The terminator can be a member of the LiveOut set. LLVM's definition diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 7196bc82edc..11e5549c332 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1614,7 +1614,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // Check to see if we have a branch or switch on an undefined value. If so // we force the branch to go one way or the other to make the successor // values live. It doesn't really matter which way we force it. - TerminatorInst *TI = BB.getTerminator(); + Instruction *TI = BB.getTerminator(); if (auto *BI = dyn_cast(TI)) { if (!BI->isConditional()) continue; if (!getValueState(BI->getCondition()).isUnknown()) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 6e991409bf0..0f43ee6bbd7 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1211,7 +1211,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // predecessor blocks. The only thing to watch out for is that we can't put // a possibly trapping load in the predecessor if it is a critical edge. 
for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { - TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); + Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator(); Value *InVal = PN.getIncomingValue(Idx); // If the value is produced by the terminator of the predecessor (an @@ -1275,7 +1275,7 @@ static void speculatePHINodeLoads(PHINode &PN) { continue; } - TerminatorInst *TI = Pred->getTerminator(); + Instruction *TI = Pred->getTerminator(); IRBuilderTy PredBuilder(TI); LoadInst *Load = PredBuilder.CreateLoad( diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 17035f469da..6c4773aa92e 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -783,7 +783,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, [](Instruction &I) { return I.mayHaveSideEffects(); })) return Changed; - TerminatorInst *CurrentTerm = CurrentBB->getTerminator(); + Instruction *CurrentTerm = CurrentBB->getTerminator(); if (auto *SI = dyn_cast(CurrentTerm)) { // Don't bother trying to unswitch past a switch with a constant diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index f58f79f8b14..2bfd9927411 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -636,7 +636,7 @@ void StructurizeCFG::setPhiValues() { /// Remove phi values from all successors and then remove the terminator. 
void StructurizeCFG::killTerminator(BasicBlock *BB) { - TerminatorInst *Term = BB->getTerminator(); + Instruction *Term = BB->getTerminator(); if (!Term) return; diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 6a77a2d414f..0f6db21f73b 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -702,7 +702,7 @@ static bool foldReturnAndProcessPred( SmallVector UncondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *Pred = *PI; - TerminatorInst *PTI = Pred->getTerminator(); + Instruction *PTI = Pred->getTerminator(); if (BranchInst *BI = dyn_cast(PTI)) if (BI->isUnconditional()) UncondBranchPreds.push_back(BI); diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index c3d67087ae7..fafc9aaba5c 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -318,7 +318,7 @@ findIBRPredecessor(BasicBlock *BB, SmallVectorImpl &OtherPreds) { BasicBlock *IBB = nullptr; for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) { BasicBlock *PredBB = PN->getIncomingBlock(Pred); - TerminatorInst *PredTerm = PredBB->getTerminator(); + Instruction *PredTerm = PredBB->getTerminator(); switch (PredTerm->getOpcode()) { case Instruction::IndirectBr: if (IBB) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index a9257a8c670..000af808945 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -365,7 +365,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, } // Finally, clone over the terminator. 
- const TerminatorInst *OldTI = BB->getTerminator(); + const Instruction *OldTI = BB->getTerminator(); bool TerminatorDone = false; if (const BranchInst *BI = dyn_cast(OldTI)) { if (BI->isConditional()) { @@ -414,7 +414,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, CodeInfo->OperandBundleCallSites.push_back(NewInst); // Recursively clone any reachable successor blocks. - const TerminatorInst *TI = BB->getTerminator(); + const Instruction *TI = BB->getTerminator(); for (const BasicBlock *Succ : successors(TI)) ToClone.push_back(Succ); } diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 7f26c53ecf3..0e9e3219033 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -566,7 +566,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // changing them to branch to NewBB instead. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (Blocks.count(PN->getIncomingBlock(i))) { - TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator(); + Instruction *TI = PN->getIncomingBlock(i)->getTerminator(); TI->replaceUsesOfWith(OldPred, NewBB); } @@ -778,7 +778,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, Value *Idx[2]; Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext())); Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i); - TerminatorInst *TI = newFunction->begin()->getTerminator(); + Instruction *TI = newFunction->begin()->getTerminator(); GetElementPtrInst *GEP = GetElementPtrInst::Create( StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI); RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI); @@ -972,7 +972,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, unsigned switchVal = 0; for (BasicBlock *Block : Blocks) { - TerminatorInst *TI = Block->getTerminator(); + Instruction *TI = Block->getTerminator(); for (unsigned i = 
0, e = TI->getNumSuccessors(); i != e; ++i) if (!Blocks.count(TI->getSuccessor(i))) { BasicBlock *OldTarget = TI->getSuccessor(i); @@ -1078,7 +1078,7 @@ void CodeExtractor::calculateNewCallTerminatorWeights( using BlockNode = BlockFrequencyInfoImplBase::BlockNode; // Update the branch weights for the exit block. - TerminatorInst *TI = CodeReplacer->getTerminator(); + Instruction *TI = CodeReplacer->getTerminator(); SmallVector BranchWeights(TI->getNumSuccessors(), 0); // Block Frequency distribution with dummy node. diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp index c9c96fbe5da..762a374c135 100644 --- a/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -37,7 +37,7 @@ IRBuilder<> *EscapeEnumerator::Next() { // Branches and invokes do not escape, only unwind, resume, and return // do. - TerminatorInst *TI = CurBB->getTerminator(); + Instruction *TI = CurBB->getTerminator(); if (!isa(TI) && !isa(TI)) continue; diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp index 3c6c9c9a5df..d9778f4a1fb 100644 --- a/lib/Transforms/Utils/FlattenCFG.cpp +++ b/lib/Transforms/Utils/FlattenCFG.cpp @@ -232,7 +232,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock)) return false; - TerminatorInst *TBB = LastCondBlock->getTerminator(); + Instruction *TBB = LastCondBlock->getTerminator(); BasicBlock *PS1 = TBB->getSuccessor(0); BasicBlock *PS2 = TBB->getSuccessor(1); BranchInst *PBI1 = dyn_cast(PS1->getTerminator()); @@ -325,7 +325,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, BasicBlock *Block1, BasicBlock *Block2) { - TerminatorInst *PTI2 = Head2->getTerminator(); + Instruction *PTI2 = Head2->getTerminator(); Instruction *PBI2 = 
&Head2->front(); bool eq1 = (Block1 == Head1); @@ -421,7 +421,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock)) return false; - TerminatorInst *PTI2 = SecondEntryBlock->getTerminator(); + Instruction *PTI2 = SecondEntryBlock->getTerminator(); Instruction *PBI2 = &SecondEntryBlock->front(); if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1, diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp index 69203f9f248..ef991d715fd 100644 --- a/lib/Transforms/Utils/FunctionComparator.cpp +++ b/lib/Transforms/Utils/FunctionComparator.cpp @@ -867,8 +867,8 @@ int FunctionComparator::compare() { if (int Res = cmpBasicBlocks(BBL, BBR)) return Res; - const TerminatorInst *TermL = BBL->getTerminator(); - const TerminatorInst *TermR = BBR->getTerminator(); + const Instruction *TermL = BBL->getTerminator(); + const Instruction *TermR = BBR->getTerminator(); assert(TermL->getNumSuccessors() == TermR->getNumSuccessors()); for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) { @@ -938,7 +938,7 @@ FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) { for (auto &Inst : *BB) { H.add(Inst.getOpcode()); } - const TerminatorInst *Term = BB->getTerminator(); + const Instruction *Term = BB->getTerminator(); for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { if (!VisitedBBs.insert(Term->getSuccessor(i)).second) continue; diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index f8226f529ee..bda2ee2d8a3 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -2247,7 +2247,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Change the branch that used to go to AfterCallBB to branch to the first // basic block of the inlined function. 
// - TerminatorInst *Br = OrigBB->getTerminator(); + Instruction *Br = OrigBB->getTerminator(); assert(Br && Br->getOpcode() == Instruction::Br && "splitBasicBlock broken!"); Br->setOperand(0, &*FirstNewBlock); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 879145cea6b..04db1c8c4c7 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -105,7 +105,7 @@ STATISTIC(NumRemoved, "Number of unreachable basic blocks removed"); bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, const TargetLibraryInfo *TLI, DomTreeUpdater *DTU) { - TerminatorInst *T = BB->getTerminator(); + Instruction *T = BB->getTerminator(); IRBuilder<> Builder(T); // Branch - See if we are conditional jumping on constant @@ -2101,7 +2101,7 @@ static bool markAliveBlocks(Function &F, } } - TerminatorInst *Terminator = BB->getTerminator(); + Instruction *Terminator = BB->getTerminator(); if (auto *II = dyn_cast(Terminator)) { // Turn invokes that call 'nounwind' functions into ordinary calls. Value *Callee = II->getCalledValue(); @@ -2176,7 +2176,7 @@ static bool markAliveBlocks(Function &F, } void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); if (auto *II = dyn_cast(TI)) { changeToCall(II, DTU); diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp index a6320d8dbf4..73f67f3219d 100644 --- a/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -299,7 +299,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. 
- TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + Instruction *LoopEntryBranch = OrigPreheader->getTerminator(); // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. using DbgIntrinsicHash = diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp index fc59cafa331..380f4fca54d 100644 --- a/lib/Transforms/Utils/LoopSimplify.cpp +++ b/lib/Transforms/Utils/LoopSimplify.cpp @@ -435,7 +435,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop"); MDNode *LoopMD = nullptr; for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) { - TerminatorInst *TI = BackedgeBlocks[i]->getTerminator(); + Instruction *TI = BackedgeBlocks[i]->getTerminator(); if (!LoopMD) LoopMD = TI->getMetadata(LoopMDKind); TI->setMetadata(LoopMDKind, nullptr); @@ -488,7 +488,7 @@ ReprocessLoop: << P->getName() << "\n"); // Zap the dead pred's terminator and replace it with unreachable. - TerminatorInst *TI = P->getTerminator(); + Instruction *TI = P->getTerminator(); changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA); Changed = true; } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index a8ec75c0baf..877e0e4dcf9 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -781,7 +781,7 @@ LoopUnrollResult llvm::UnrollLoop( // there is no such latch. 
NewIDom = Latches.back(); for (BasicBlock *IterLatch : Latches) { - TerminatorInst *Term = IterLatch->getTerminator(); + Instruction *Term = IterLatch->getTerminator(); if (isa(Term) && cast(Term)->isConditional()) { NewIDom = IterLatch; break; diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp index 1ce2f844489..c17a64f0187 100644 --- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -72,7 +72,7 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop, for (BasicBlock *BB : ForeBlocks) { if (BB == SubLoopPreHeader) continue; - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) if (!ForeBlocks.count(TI->getSuccessor(i))) return false; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index ebbcf800254..8dad6176c51 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -175,13 +175,13 @@ class SimplifyCFGOpt { const SimplifyCFGOptions &Options; bool Resimplify; - Value *isValueEqualityComparison(TerminatorInst *TI); + Value *isValueEqualityComparison(Instruction *TI); BasicBlock *GetValueEqualityComparisonCases( - TerminatorInst *TI, std::vector &Cases); - bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, + Instruction *TI, std::vector &Cases); + bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder); - bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI, + bool FoldValueComparisonIntoPredecessors(Instruction *TI, IRBuilder<> &Builder); bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder); @@ -219,7 +219,7 @@ public: /// Return true if it is safe to merge these two /// terminator instructions together. 
static bool -SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2, +SafeToMergeTerminators(Instruction *SI1, Instruction *SI2, SmallSetVector *FailBlocks = nullptr) { if (SI1 == SI2) return false; // Can't merge with self! @@ -670,7 +670,7 @@ private: } // end anonymous namespace -static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { +static void EraseTerminatorAndDCECond(Instruction *TI) { Instruction *Cond = nullptr; if (SwitchInst *SI = dyn_cast(TI)) { Cond = dyn_cast(SI->getCondition()); @@ -688,7 +688,7 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) { /// Return true if the specified terminator checks /// to see if a value is equal to constant integer value. -Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { +Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) { Value *CV = nullptr; if (SwitchInst *SI = dyn_cast(TI)) { // Do not permit merging of large switch instructions into their @@ -716,7 +716,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { /// Given a value comparison instruction, /// decode all of the 'cases' that it represents and return the 'default' block. BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases( - TerminatorInst *TI, std::vector &Cases) { + Instruction *TI, std::vector &Cases) { if (SwitchInst *SI = dyn_cast(TI)) { Cases.reserve(SI->getNumCases()); for (auto Case : SI->cases()) @@ -806,7 +806,7 @@ static void setBranchWeights(Instruction *I, uint32_t TrueWeight, /// determines the outcome of this comparison. If so, simplify TI. This does a /// very limited form of jump threading. bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor( - TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder) { + Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) { Value *PredVal = isValueEqualityComparison(Pred->getTerminator()); if (!PredVal) return false; // Not a value comparison in predecessor. 
@@ -854,7 +854,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor( << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); - EraseTerminatorInstAndDCECond(TI); + EraseTerminatorAndDCECond(TI); return true; } @@ -936,7 +936,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor( << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); - EraseTerminatorInstAndDCECond(TI); + EraseTerminatorAndDCECond(TI); return true; } @@ -971,10 +971,10 @@ static inline bool HasBranchWeights(const Instruction *I) { return false; } -/// Get Weights of a given TerminatorInst, the default weight is at the front +/// Get Weights of a given terminator, the default weight is at the front /// of the vector. If TI is a conditional eq, we need to swap the branch-weight /// metadata. -static void GetBranchWeights(TerminatorInst *TI, +static void GetBranchWeights(Instruction *TI, SmallVectorImpl &Weights) { MDNode *MD = TI->getMetadata(LLVMContext::MD_prof); assert(MD); @@ -1008,7 +1008,7 @@ static void FitWeights(MutableArrayRef Weights) { /// (either a switch or a branch on "X == c"). /// See if any of the predecessors of the terminator block are value comparisons /// on the same value. If so, and if safe to do so, fold them together. -bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, +bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(Instruction *TI, IRBuilder<> &Builder) { BasicBlock *BB = TI->getParent(); Value *CV = isValueEqualityComparison(TI); // CondVal @@ -1020,7 +1020,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, BasicBlock *Pred = Preds.pop_back_val(); // See if the predecessor is a comparison with the same value. 
- TerminatorInst *PTI = Pred->getTerminator(); + Instruction *PTI = Pred->getTerminator(); Value *PCV = isValueEqualityComparison(PTI); // PredCondVal if (PCV == CV && TI != PTI) { @@ -1197,7 +1197,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, setBranchWeights(NewSI, MDWeights); } - EraseTerminatorInstAndDCECond(PTI); + EraseTerminatorAndDCECond(PTI); // Okay, last check. If BB is still a successor of PSI, then we must // have an infinite loop case. If so, add an infinitely looping block @@ -1413,7 +1413,7 @@ HoistTerminator: for (BasicBlock *Succ : successors(BB1)) AddPredecessorToBlock(Succ, BIParent, BB1); - EraseTerminatorInstAndDCECond(BI); + EraseTerminatorAndDCECond(BI); return true; } @@ -2247,7 +2247,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL, // Loop over all of the edges from PredBB to BB, changing them to branch // to EdgeBB instead. - TerminatorInst *PredBBTI = PredBB->getTerminator(); + Instruction *PredBBTI = PredBB->getTerminator(); for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i) if (PredBBTI->getSuccessor(i) == BB) { BB->removePredecessor(PredBB); @@ -2408,7 +2408,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement // has been flattened. Change DomBlock to jump directly to our new block to // avoid other simplifycfg's kicking in on the diamond. 
- TerminatorInst *OldTI = DomBlock->getTerminator(); + Instruction *OldTI = DomBlock->getTerminator(); Builder.SetInsertPoint(OldTI); Builder.CreateBr(BB); OldTI->eraseFromParent(); @@ -2442,7 +2442,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, TrueSucc->removePredecessor(BI->getParent()); FalseSucc->removePredecessor(BI->getParent()); Builder.CreateRetVoid(); - EraseTerminatorInstAndDCECond(BI); + EraseTerminatorAndDCECond(BI); return true; } @@ -2498,7 +2498,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI, << "\n " << *BI << "NewRet = " << *RI << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: " << *FalseSucc); - EraseTerminatorInstAndDCECond(BI); + EraseTerminatorAndDCECond(BI); return true; } @@ -2822,7 +2822,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) { } // Change PBI from Conditional to Unconditional. BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI); - EraseTerminatorInstAndDCECond(PBI); + EraseTerminatorAndDCECond(PBI); PBI = New_PBI; } @@ -3417,7 +3417,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // Takes care of updating the successors and removing the old terminator. // Also makes sure not to introduce new successors by assuming that edges to // non-successor TrueBBs and FalseBBs aren't reachable. 
-static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, +static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond, BasicBlock *TrueBB, BasicBlock *FalseBB, uint32_t TrueWeight, uint32_t FalseWeight) { @@ -3472,7 +3472,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond, Builder.CreateBr(FalseBB); } - EraseTerminatorInstAndDCECond(OldTerm); + EraseTerminatorAndDCECond(OldTerm); return true; } @@ -3715,7 +3715,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, BasicBlock *NewBB = BB->splitBasicBlock(BI->getIterator(), "switch.early.test"); // Remove the uncond branch added to the old block. - TerminatorInst *OldTI = BB->getTerminator(); + Instruction *OldTI = BB->getTerminator(); Builder.SetInsertPoint(OldTI); if (TrueWhenEqual) @@ -3759,7 +3759,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, } // Erase the old branch instruction. - EraseTerminatorInstAndDCECond(BI); + EraseTerminatorAndDCECond(BI); LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n'); return true; @@ -4007,7 +4007,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) { if (UnwindDest == nullptr) { removeUnwindEdge(PredBB); } else { - TerminatorInst *TI = PredBB->getTerminator(); + Instruction *TI = PredBB->getTerminator(); TI->replaceUsesOfWith(BB, UnwindDest); } } @@ -4076,7 +4076,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { SmallVector CondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *P = *PI; - TerminatorInst *PTI = P->getTerminator(); + Instruction *PTI = P->getTerminator(); if (BranchInst *BI = dyn_cast(PTI)) { if (BI->isUnconditional()) UncondBranchPreds.push_back(P); @@ -4181,7 +4181,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { SmallVector Preds(pred_begin(BB), pred_end(BB)); for (unsigned i = 0, e = Preds.size(); i != e; ++i) { - 
TerminatorInst *TI = Preds[i]->getTerminator(); + Instruction *TI = Preds[i]->getTerminator(); IRBuilder<> Builder(TI); if (auto *BI = dyn_cast(TI)) { if (BI->isUnconditional()) { @@ -4193,10 +4193,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { } else { if (BI->getSuccessor(0) == BB) { Builder.CreateBr(BI->getSuccessor(1)); - EraseTerminatorInstAndDCECond(BI); + EraseTerminatorAndDCECond(BI); } else if (BI->getSuccessor(1) == BB) { Builder.CreateBr(BI->getSuccessor(0)); - EraseTerminatorInstAndDCECond(BI); + EraseTerminatorAndDCECond(BI); Changed = true; } } @@ -4438,7 +4438,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, SplitBlock(&*NewDefault, &NewDefault->front()); auto *OldTI = NewDefault->getTerminator(); new UnreachableInst(SI->getContext(), OldTI); - EraseTerminatorInstAndDCECond(OldTI); + EraseTerminatorAndDCECond(OldTI); return true; } @@ -4649,12 +4649,12 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest, SmallDenseMap ConstantPool; ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal)); for (Instruction &I :CaseDest->instructionsWithoutDebug()) { - if (TerminatorInst *T = dyn_cast(&I)) { + if (I.isTerminator()) { // If the terminator is a simple branch, continue to the next block. - if (T->getNumSuccessors() != 1 || T->isExceptionalTerminator()) + if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator()) return false; Pred = CaseDest; - CaseDest = T->getSuccessor(0); + CaseDest = I.getSuccessor(0); } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) { // Instruction is side-effect free and constant. @@ -5663,14 +5663,14 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) { if (IBI->getNumDestinations() == 0) { // If the indirectbr has no successors, change it to unreachable. 
new UnreachableInst(IBI->getContext(), IBI); - EraseTerminatorInstAndDCECond(IBI); + EraseTerminatorAndDCECond(IBI); return true; } if (IBI->getNumDestinations() == 1) { // If the indirectbr has one successor, change it to a direct branch. BranchInst::Create(IBI->getDestination(0), IBI); - EraseTerminatorInstAndDCECond(IBI); + EraseTerminatorAndDCECond(IBI); return true; } @@ -5892,7 +5892,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { } else { // If Successor #1 has multiple preds, we may be able to conditionally // execute Successor #0 if it branches to Successor #1. - TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator(); + Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator(); if (Succ0TI->getNumSuccessors() == 1 && Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI)) @@ -5901,7 +5901,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { } else if (BI->getSuccessor(1)->getSinglePredecessor()) { // If Successor #0 has multiple preds, we may be able to conditionally // execute Successor #1 if it branches to Successor #0. 
- TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator(); + Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator(); if (Succ1TI->getNumSuccessors() == 1 && Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI)) @@ -5991,7 +5991,7 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { for (PHINode &PHI : BB->phis()) for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) { - TerminatorInst *T = PHI.getIncomingBlock(i)->getTerminator(); + Instruction *T = PHI.getIncomingBlock(i)->getTerminator(); IRBuilder<> Builder(T); if (BranchInst *BI = dyn_cast(T)) { BB->removePredecessor(PHI.getIncomingBlock(i)); diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index b6307acb947..0f42694e193 100644 --- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -268,7 +268,7 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() { // Set VPBB successors. We create empty VPBBs for successors if they don't // exist already. Recipes will be created when the successor is visited // during the RPO traversal. - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); assert(TI && "Terminator expected."); unsigned NumSuccs = TI->getNumSuccessors(); -- GitLab From 2b7e80d846d5029709839555a1e1f7eabd17fb8f Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 10:10:54 +0000 Subject: [PATCH 0190/1116] [TI removal] Rework `InstVisitor` to support visiting instructions that are terminators without relying on the specific `TerminatorInst` type. This also required cleaning up the `TerminatorInst` usage in two users of `InstVisitor`. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344503 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/InstVisitor.h | 43 +++++++++++++++++++++++++--------- lib/IR/Verifier.cpp | 32 ++++++++++++------------- lib/Transforms/Scalar/SCCP.cpp | 16 ++++++------- 3 files changed, 56 insertions(+), 35 deletions(-) diff --git a/include/llvm/IR/InstVisitor.h b/include/llvm/IR/InstVisitor.h index 55536f237d4..554417f984a 100644 --- a/include/llvm/IR/InstVisitor.h +++ b/include/llvm/IR/InstVisitor.h @@ -166,15 +166,6 @@ public: // Specific Instruction type classes... note that all of the casts are // necessary because we use the instruction classes as opaque types... // - RetTy visitReturnInst(ReturnInst &I) { DELEGATE(TerminatorInst);} - RetTy visitBranchInst(BranchInst &I) { DELEGATE(TerminatorInst);} - RetTy visitSwitchInst(SwitchInst &I) { DELEGATE(TerminatorInst);} - RetTy visitIndirectBrInst(IndirectBrInst &I) { DELEGATE(TerminatorInst);} - RetTy visitResumeInst(ResumeInst &I) { DELEGATE(TerminatorInst);} - RetTy visitUnreachableInst(UnreachableInst &I) { DELEGATE(TerminatorInst);} - RetTy visitCleanupReturnInst(CleanupReturnInst &I) { DELEGATE(TerminatorInst);} - RetTy visitCatchReturnInst(CatchReturnInst &I) { DELEGATE(TerminatorInst); } - RetTy visitCatchSwitchInst(CatchSwitchInst &I) { DELEGATE(TerminatorInst);} RetTy visitICmpInst(ICmpInst &I) { DELEGATE(CmpInst);} RetTy visitFCmpInst(FCmpInst &I) { DELEGATE(CmpInst);} RetTy visitAllocaInst(AllocaInst &I) { DELEGATE(UnaryInstruction);} @@ -236,6 +227,37 @@ public: return static_cast(this)->visitCallSite(&I); } + // While terminators don't have a distinct type modeling them, we support + // intercepting them with a dedicated visitor callback. 
+ RetTy visitReturnInst(ReturnInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitBranchInst(BranchInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitSwitchInst(SwitchInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitIndirectBrInst(IndirectBrInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitResumeInst(ResumeInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitUnreachableInst(UnreachableInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitCleanupReturnInst(CleanupReturnInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitCatchReturnInst(CatchReturnInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitCatchSwitchInst(CatchSwitchInst &I) { + return static_cast(this)->visitTerminator(I); + } + RetTy visitTerminator(Instruction &I) { DELEGATE(Instruction);} + // Next level propagators: If the user does not overload a specific // instruction type, they can overload one of these to get the whole class // of instructions... 
@@ -243,7 +265,6 @@ public: RetTy visitCastInst(CastInst &I) { DELEGATE(UnaryInstruction);} RetTy visitBinaryOperator(BinaryOperator &I) { DELEGATE(Instruction);} RetTy visitCmpInst(CmpInst &I) { DELEGATE(Instruction);} - RetTy visitTerminatorInst(TerminatorInst &I) { DELEGATE(Instruction);} RetTy visitUnaryInstruction(UnaryInstruction &I){ DELEGATE(Instruction);} // Provide a special visitor for a 'callsite' that visits both calls and @@ -256,7 +277,7 @@ public: DELEGATE(Instruction); assert(CS.isInvoke()); - DELEGATE(TerminatorInst); + return static_cast(this)->visitTerminator(I); } // If the user wants a 'default' case, they can choose to override this diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index d96555ca5f9..6e0bb5ad358 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -287,7 +287,7 @@ class Verifier : public InstVisitor, VerifierSupport { // Maps catchswitches and cleanuppads that unwind to siblings to the // terminators that indicate the unwind, used to detect cycles therein. - MapVector SiblingFuncletInfo; + MapVector SiblingFuncletInfo; /// Cache of constants visited in search of ConstantExprs. 
SmallPtrSet ConstantExprVisited; @@ -457,7 +457,7 @@ private: void visitStoreInst(StoreInst &SI); void verifyDominatesUse(Instruction &I, unsigned i); void visitInstruction(Instruction &I); - void visitTerminatorInst(TerminatorInst &I); + void visitTerminator(Instruction &I); void visitBranchInst(BranchInst &BI); void visitReturnInst(ReturnInst &RI); void visitSwitchInst(SwitchInst &SI); @@ -2009,7 +2009,7 @@ void Verifier::verifyFrameRecoverIndices() { } } -static Instruction *getSuccPad(TerminatorInst *Terminator) { +static Instruction *getSuccPad(Instruction *Terminator) { BasicBlock *UnwindDest; if (auto *II = dyn_cast(Terminator)) UnwindDest = II->getUnwindDest(); @@ -2028,7 +2028,7 @@ void Verifier::verifySiblingFuncletUnwinds() { if (Visited.count(PredPad)) continue; Active.insert(PredPad); - TerminatorInst *Terminator = Pair.second; + Instruction *Terminator = Pair.second; do { Instruction *SuccPad = getSuccPad(Terminator); if (Active.count(SuccPad)) { @@ -2037,7 +2037,7 @@ void Verifier::verifySiblingFuncletUnwinds() { SmallVector CycleNodes; do { CycleNodes.push_back(CyclePad); - TerminatorInst *CycleTerminator = SiblingFuncletInfo[CyclePad]; + Instruction *CycleTerminator = SiblingFuncletInfo[CyclePad]; if (CycleTerminator != CyclePad) CycleNodes.push_back(CycleTerminator); CyclePad = getSuccPad(CycleTerminator); @@ -2352,7 +2352,7 @@ void Verifier::visitBasicBlock(BasicBlock &BB) { } } -void Verifier::visitTerminatorInst(TerminatorInst &I) { +void Verifier::visitTerminator(Instruction &I) { // Ensure that terminators only exist at the end of the basic block. 
Assert(&I == I.getParent()->getTerminator(), "Terminator found in the middle of a basic block!", I.getParent()); @@ -2364,7 +2364,7 @@ void Verifier::visitBranchInst(BranchInst &BI) { Assert(BI.getCondition()->getType()->isIntegerTy(1), "Branch condition is not 'i1' type!", &BI, BI.getCondition()); } - visitTerminatorInst(BI); + visitTerminator(BI); } void Verifier::visitReturnInst(ReturnInst &RI) { @@ -2383,7 +2383,7 @@ void Verifier::visitReturnInst(ReturnInst &RI) { // Check to make sure that the return value has necessary properties for // terminators... - visitTerminatorInst(RI); + visitTerminator(RI); } void Verifier::visitSwitchInst(SwitchInst &SI) { @@ -2398,7 +2398,7 @@ void Verifier::visitSwitchInst(SwitchInst &SI) { "Duplicate integer as switch case", &SI, Case.getCaseValue()); } - visitTerminatorInst(SI); + visitTerminator(SI); } void Verifier::visitIndirectBrInst(IndirectBrInst &BI) { @@ -2408,7 +2408,7 @@ void Verifier::visitIndirectBrInst(IndirectBrInst &BI) { Assert(BI.getDestination(i)->getType()->isLabelTy(), "Indirectbr destinations must all have pointer type!", &BI); - visitTerminatorInst(BI); + visitTerminator(BI); } void Verifier::visitSelectInst(SelectInst &SI) { @@ -2987,7 +2987,7 @@ void Verifier::visitInvokeInst(InvokeInst &II) { "The unwind destination does not have an exception handling instruction!", &II); - visitTerminatorInst(II); + visitTerminator(II); } /// visitBinaryOperator - Check that both arguments to the binary operator are @@ -3538,7 +3538,7 @@ void Verifier::visitResumeInst(ResumeInst &RI) { "inside a function.", &RI); - visitTerminatorInst(RI); + visitTerminator(RI); } void Verifier::visitCatchPadInst(CatchPadInst &CPI) { @@ -3566,7 +3566,7 @@ void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) { "CatchReturnInst needs to be provided a CatchPad", &CatchReturn, CatchReturn.getOperand(0)); - visitTerminatorInst(CatchReturn); + visitTerminator(CatchReturn); } void Verifier::visitCleanupPadInst(CleanupPadInst 
&CPI) { @@ -3687,7 +3687,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { // Record cleanup sibling unwinds for verifySiblingFuncletUnwinds if (isa(&FPI) && !isa(UnwindPad) && getParentPad(UnwindPad) == getParentPad(&FPI)) - SiblingFuncletInfo[&FPI] = cast(U); + SiblingFuncletInfo[&FPI] = cast(U); } } // Make sure we visit all uses of FPI, but for nested pads stop as @@ -3788,7 +3788,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { } visitEHPadPredecessors(CatchSwitch); - visitTerminatorInst(CatchSwitch); + visitTerminator(CatchSwitch); } void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { @@ -3804,7 +3804,7 @@ void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { &CRI); } - visitTerminatorInst(CRI); + visitTerminator(CRI); } void Verifier::verifyDominatesUse(Instruction &I, unsigned i) { diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 11e5549c332..b7340f294fd 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -563,7 +563,7 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. - void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl &Succs); + void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl &Succs); // OperandChangedState - This method is invoked on all of the users of an // instruction that was just changed state somehow. Based on this @@ -604,7 +604,7 @@ private: // Terminators void visitReturnInst(ReturnInst &I); - void visitTerminatorInst(TerminatorInst &TI); + void visitTerminator(Instruction &TI); void visitCastInst(CastInst &I); void visitSelectInst(SelectInst &I); @@ -615,7 +615,7 @@ private: void visitCatchSwitchInst(CatchSwitchInst &CPI) { markOverdefined(&CPI); - visitTerminatorInst(CPI); + visitTerminator(CPI); } // Instructions that cannot be folded away. 
@@ -630,12 +630,12 @@ private: void visitInvokeInst (InvokeInst &II) { visitCallSite(&II); - visitTerminatorInst(II); + visitTerminator(II); } void visitCallSite (CallSite CS); - void visitResumeInst (TerminatorInst &I) { /*returns void*/ } - void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } + void visitResumeInst (ResumeInst &I) { /*returns void*/ } + void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } void visitInstruction(Instruction &I) { @@ -650,7 +650,7 @@ private: // getFeasibleSuccessors - Return a vector of booleans to indicate which // successors are reachable from a given terminator instruction. -void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, +void SCCPSolver::getFeasibleSuccessors(Instruction &TI, SmallVectorImpl &Succs) { Succs.resize(TI.getNumSuccessors()); if (auto *BI = dyn_cast(&TI)) { @@ -837,7 +837,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) { } } -void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) { +void SCCPSolver::visitTerminator(Instruction &TI) { SmallVector SuccFeasible; getFeasibleSuccessors(TI, SuccFeasible); -- GitLab From d8d8371469074d303b39e8047f74e382e5eeb8b7 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Mon, 15 Oct 2018 10:42:50 +0000 Subject: [PATCH 0191/1116] [TI removal] Make `getTerminator()` return a generic `Instruction`. This removes the primary remaining API producing `TerminatorInst` which will reduce the rate at which code is introduced trying to use it and generally make it much easier to remove the remaining APIs across the codebase. Also clean up some of the stragglers that the previous mechanical update of variables missed. Users of LLVM and out-of-tree code generally will need to update any explicit variable types to handle this. Replacing `TerminatorInst` with `Instruction` (or `auto`) almost always works. 
Most of these edits were made in prior commits using the perl one-liner: ``` perl -i -ple 's/TerminatorInst(\b.* = .*getTerminator\(\))/Instruction\1/g' ``` This may also break some rare use cases where people overload for both `Instruction` and `TerminatorInst`, but these should be easily fixed by removing the `TerminatorInst` overload. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344504 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/BasicBlock.h | 21 ++++++++++----------- lib/IR/BasicBlock.cpp | 7 ++++--- lib/Transforms/Coroutines/CoroFrame.cpp | 2 +- lib/Transforms/Scalar/GVNHoist.cpp | 2 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 18 +++++++++--------- tools/bugpoint/CrashDebugger.cpp | 2 +- tools/llvm-diff/DifferenceEngine.cpp | 4 ++-- unittests/IR/DominatorTreeTest.cpp | 2 +- unittests/IR/IRBuilderTest.cpp | 2 +- 9 files changed, 30 insertions(+), 30 deletions(-) diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 1ee19975af7..7244bba1ca5 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -38,7 +38,6 @@ class LandingPadInst; class LLVMContext; class Module; class PHINode; -class TerminatorInst; class ValueSymbolTable; /// LLVM Basic Block Representation @@ -50,12 +49,12 @@ class ValueSymbolTable; /// represents a label to which a branch can jump. /// /// A well formed basic block is formed of a list of non-terminating -/// instructions followed by a single TerminatorInst instruction. -/// TerminatorInst's may not occur in the middle of basic blocks, and must -/// terminate the blocks. The BasicBlock class allows malformed basic blocks to -/// occur because it may be useful in the intermediate stage of constructing or -/// modifying a program. However, the verifier will ensure that basic blocks -/// are "well formed". +/// instructions followed by a single terminator instruction. 
Terminator +/// instructions may not occur in the middle of basic blocks, and must terminate +/// the blocks. The BasicBlock class allows malformed basic blocks to occur +/// because it may be useful in the intermediate stage of constructing or +/// modifying a program. However, the verifier will ensure that basic blocks are +/// "well formed". class BasicBlock final : public Value, // Basic blocks are data objects also public ilist_node_with_parent { public: @@ -120,10 +119,10 @@ public: /// Returns the terminator instruction if the block is well formed or null /// if the block is not well formed. - const TerminatorInst *getTerminator() const LLVM_READONLY; - TerminatorInst *getTerminator() { - return const_cast( - static_cast(this)->getTerminator()); + const Instruction *getTerminator() const LLVM_READONLY; + Instruction *getTerminator() { + return const_cast( + static_cast(this)->getTerminator()); } /// Returns the call instruction calling \@llvm.experimental.deoptimize diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 03fb5ccaffc..12ab2e2ace4 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -135,9 +135,10 @@ const Module *BasicBlock::getModule() const { return getParent()->getParent(); } -const TerminatorInst *BasicBlock::getTerminator() const { - if (InstList.empty()) return nullptr; - return dyn_cast(&InstList.back()); +const Instruction *BasicBlock::getTerminator() const { + if (InstList.empty() || !InstList.back().isTerminator()) + return nullptr; + return &InstList.back(); } const CallInst *BasicBlock::getTerminatingMustTailCall() const { diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 4357948d5ab..4cb0a52961c 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -601,7 +601,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { } // Sets the unwind edge of an instruction to a particular successor. 
-static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) { +static void setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) { if (auto *II = dyn_cast(TI)) II->setUnwindDest(Succ); else if (auto *CS = dyn_cast(TI)) diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 3043df9cca7..0797ce9adea 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -577,7 +577,7 @@ private: // Returns the edge via which an instruction in BB will get the values from. // Returns true when the values are flowing out to each edge. - bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const { + bool valueAnticipable(CHIArgs C, Instruction *TI) const { if (TI->getNumSuccessors() > (unsigned)size(C)) return false; // Not enough args in this CHI. diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 79b575b78cd..5fdbf219009 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1536,12 +1536,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check for terminator values (e.g. invoke). 
for (unsigned j = 0; j < VL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - TerminatorInst *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); - if (Term) { - LLVM_DEBUG( - dbgs() - << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); + Instruction *Term = dyn_cast( + cast(VL[j])->getIncomingValueForBlock( + PH->getIncomingBlock(i))); + if (Term && Term->isTerminator()) { + LLVM_DEBUG(dbgs() + << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; @@ -3652,7 +3652,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { if (PHINode *PH = dyn_cast(User)) { for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { if (PH->getIncomingValue(i) == Scalar) { - TerminatorInst *IncomingTerminator = + Instruction *IncomingTerminator = PH->getIncomingBlock(i)->getTerminator(); if (isa(IncomingTerminator)) { Builder.SetInsertPoint(VecI->getParent(), @@ -3960,7 +3960,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ScheduleEnd = I->getNextNode(); if (isOneOf(S, I) != I) CheckSheduleForI(I); - assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); + assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; } @@ -3996,7 +3996,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, ScheduleEnd = I->getNextNode(); if (isOneOf(S, I) != I) CheckSheduleForI(I); - assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); + assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp index e973bfef4dc..a50ff4c255b 100644 --- a/tools/bugpoint/CrashDebugger.cpp +++ 
b/tools/bugpoint/CrashDebugger.cpp @@ -409,7 +409,7 @@ bool ReduceCrashingBlocks::TestBlocks(std::vector &BBs) { for (BasicBlock *Succ : successors(&BB)) Succ->removePredecessor(&BB); - TerminatorInst *BBTerm = BB.getTerminator(); + Instruction *BBTerm = BB.getTerminator(); if (BBTerm->isEHPad() || BBTerm->getType()->isTokenTy()) continue; if (!BBTerm->getType()->isVoidTy()) diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp index b2673c1407f..acff8bb3e89 100644 --- a/tools/llvm-diff/DifferenceEngine.cpp +++ b/tools/llvm-diff/DifferenceEngine.cpp @@ -629,8 +629,8 @@ void FunctionDifferenceEngine::runBlockDiff(BasicBlock::iterator LStart, // If the terminators have different kinds, but one is an invoke and the // other is an unconditional branch immediately following a call, unify // the results and the destinations. - TerminatorInst *LTerm = LStart->getParent()->getTerminator(); - TerminatorInst *RTerm = RStart->getParent()->getTerminator(); + Instruction *LTerm = LStart->getParent()->getTerminator(); + Instruction *RTerm = RStart->getParent()->getTerminator(); if (isa(LTerm) && isa(RTerm)) { if (cast(LTerm)->isConditional()) return; BasicBlock::iterator I = LTerm->getIterator(); diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp index cf81623d0d1..7539bbc860b 100644 --- a/unittests/IR/DominatorTreeTest.cpp +++ b/unittests/IR/DominatorTreeTest.cpp @@ -301,7 +301,7 @@ TEST(DominatorTree, NonUniqueEdges) { BasicBlock *BB1 = &*FI++; BasicBlock *BB2 = &*FI++; - const TerminatorInst *TI = BB0->getTerminator(); + const Instruction *TI = BB0->getTerminator(); assert(TI->getNumSuccessors() == 3 && "Switch has three successors"); BasicBlockEdge Edge_BB0_BB2(BB0, TI->getSuccessor(0)); diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp index 713c0a14f66..be29b41309a 100644 --- a/unittests/IR/IRBuilderTest.cpp +++ b/unittests/IR/IRBuilderTest.cpp @@ -160,7 +160,7 @@ 
TEST_F(IRBuilderTest, CreateCondBr) { BasicBlock *FBB = BasicBlock::Create(Ctx, "", F); BranchInst *BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB); - TerminatorInst *TI = BB->getTerminator(); + Instruction *TI = BB->getTerminator(); EXPECT_EQ(BI, TI); EXPECT_EQ(2u, TI->getNumSuccessors()); EXPECT_EQ(TBB, TI->getSuccessor(0)); -- GitLab From 4b284c14ecdd7b86ec6d28370e7860012e546cae Mon Sep 17 00:00:00 2001 From: Fedor Sergeev Date: Mon, 15 Oct 2018 10:46:35 +0000 Subject: [PATCH 0192/1116] [NewPM] implement SCC printing for -print-before-all/-print-after-all Removing deficiency of initial implementation of -print-before-all/-after-all - it was effectively skipping IR printing for all the SCC passes. Now LazyCallGraph:SCC gets its IR printed. Reviewed By: skatkov Differential Revision: https://reviews.llvm.org/D53270 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344505 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Passes/StandardInstrumentations.cpp | 32 ++++++++++++++-- test/Other/scc-pass-printer.ll | 49 +++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 test/Other/scc-pass-printer.ll diff --git a/lib/Passes/StandardInstrumentations.cpp b/lib/Passes/StandardInstrumentations.cpp index aa34584fa12..48d36e5a01e 100644 --- a/lib/Passes/StandardInstrumentations.cpp +++ b/lib/Passes/StandardInstrumentations.cpp @@ -37,10 +37,6 @@ namespace PrintIR { /// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into /// llvm::Any and does actual print job. 
void unwrapAndPrint(StringRef Banner, Any IR) { - if (any_isa(IR) || - any_isa(IR)) - return; - SmallString<40> Extra{"\n"}; const Module *M = nullptr; if (any_isa(IR)) { @@ -55,6 +51,34 @@ void unwrapAndPrint(StringRef Banner, Any IR) { } M = F->getParent(); Extra = formatv(" (function: {0})\n", F->getName()); + } else if (any_isa(IR)) { + const LazyCallGraph::SCC *C = any_cast(IR); + assert(C); + if (!llvm::forcePrintModuleIR()) { + Extra = formatv(" (scc: {0})\n", C->getName()); + bool BannerPrinted = false; + for (const LazyCallGraph::Node &N : *C) { + const Function &F = N.getFunction(); + if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) { + if (!BannerPrinted) { + dbgs() << Banner << Extra; + BannerPrinted = true; + } + F.print(dbgs()); + } + } + return; + } + for (const LazyCallGraph::Node &N : *C) { + const Function &F = N.getFunction(); + if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) { + M = F.getParent(); + break; + } + } + if (!M) + return; + Extra = formatv(" (for scc: {0})\n", C->getName()); } else if (any_isa(IR)) { const Loop *L = any_cast(IR); const Function *F = L->getHeader()->getParent(); diff --git a/test/Other/scc-pass-printer.ll b/test/Other/scc-pass-printer.ll new file mode 100644 index 00000000000..9d86bf03963 --- /dev/null +++ b/test/Other/scc-pass-printer.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -inline -print-after-all | FileCheck %s -check-prefix=INL +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -passes=inline -print-after-all | FileCheck %s -check-prefix=INL +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -inline -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD +; RUN: opt < %s 2>&1 -disable-output \ +; RUN: -passes=inline -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD + +; INL: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}} +; INL: define void @bar() +; INL-NEXT: call void @foo() +; INL: define 
void @foo() +; INL-NEXT: call void @bar() +; INL: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}} +; INL: define void @tester() +; INL-NEXT: call void @foo() +; INL: IR Dump After + +; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}} +; INL-MOD: define void @tester() +; INL-MOD-NEXT: call void @foo() +; INL-MOD: define void @foo() +; INL-MOD-NEXT: call void @bar() +; INL-MOD: define void @bar() +; INL-MOD-NEXT: call void @foo() +; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}} +; INL-MOD: define void @tester() +; INL-MOD-NEXT: call void @foo() +; INL-MOD: define void @foo() +; INL-MOD-NEXT: call void @bar() +; INL-MOD: define void @bar() +; INL-MOD-NEXT: call void @foo() +; INL-MOD: IR Dump After + +define void @tester() noinline { + call void @foo() + ret void +} + +define void @foo() noinline { + call void @bar() + ret void +} + +define void @bar() noinline { + call void @foo() + ret void +} -- GitLab From 69b3f302bf683ae0975608aa35fc9602f5026e39 Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 15 Oct 2018 11:37:04 +0000 Subject: [PATCH 0193/1116] AMDGPU: Test showing a scalar buffer load deficiency Change-Id: I5b64a565f22a8482aa0712488d85e45163ac3d12 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344506 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AMDGPU/smrd.ll | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 612943b66c4..6596119f8b3 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -511,6 +511,29 @@ main_body: ret void } +; GCN-LABEL: {{^}}smrd_uniform_loop: +; +; TODO: this should use an s_buffer_load +; +; GCN: buffer_load_dword +define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 { +main_body: + br label %loop + +loop: + %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ] + 
%sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ] + %offset = shl i32 %counter, 2 + %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset) + %sum.next = fadd float %sum, %v + %counter.next = add i32 %counter, 1 + %cc = icmp uge i32 %counter.next, %bound + br i1 %cc, label %exit, label %loop + +exit: + ret float %sum.next +} + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 -- GitLab From ee084ebe8c45915458aecd5dbb9ff131c63aee97 Mon Sep 17 00:00:00 2001 From: Aleksandar Beserminji Date: Mon, 15 Oct 2018 12:59:17 +0000 Subject: [PATCH 0194/1116] [mips][micromips] Fix overlapping FDEs error When compiling a static executable for micromips, CFI symbols are incorrectly labeled as MICROMIPS, which causes ".eh_frame_hdr refers to overlapping FDEs." error. This patch does not label CFI symbols as MICROMIPS, and FDEs do not overlap anymore. This patch also exposes another bug, which is fixed here: https://reviews.llvm.org/D52985 Differential Revision: https://reviews.llvm.org/D52987 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344511 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Mips/MCTargetDesc/MipsELFStreamer.cpp | 17 +++++++++ .../Mips/MCTargetDesc/MipsELFStreamer.h | 7 ++++ test/DebugInfo/Mips/eh_frame.ll | 38 +++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 test/DebugInfo/Mips/eh_frame.ll diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 7b9a02503ce..21b01e85096 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" @@ -53,6 +54,22 @@ void 
MipsELFStreamer::EmitInstruction(const MCInst &Inst, createPendingLabelRelocs(); } +void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { + Frame.Begin = getContext().createTempSymbol(); + MCELFStreamer::EmitLabel(Frame.Begin); +} + +MCSymbol *MipsELFStreamer::EmitCFILabel() { + MCSymbol *Label = getContext().createTempSymbol("cfi", true); + MCELFStreamer::EmitLabel(Label); + return Label; +} + +void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { + Frame.End = getContext().createTempSymbol(); + MCELFStreamer::EmitLabel(Frame.End); +} + void MipsELFStreamer::createPendingLabelRelocs() { MipsTargetELFStreamer *ELFTargetStreamer = static_cast(getTargetStreamer()); diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index d141f5d77c6..d140201494f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -25,6 +25,7 @@ namespace llvm { class MCAsmBackend; class MCCodeEmitter; class MCContext; +class MCDwarfFrameInfo; class MCSubtargetInfo; class MipsELFStreamer : public MCELFStreamer { @@ -60,6 +61,12 @@ public: void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; void EmitIntValue(uint64_t Value, unsigned Size) override; + // Overriding these functions allows us to avoid recording of these labels + // in EmitLabel and later marking them as microMIPS. + void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; + void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; + MCSymbol *EmitCFILabel() override; + /// Emits all the option records stored up until the point it's called. 
void EmitMipsOptionRecords(); diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll new file mode 100644 index 00000000000..4687443cb1c --- /dev/null +++ b/test/DebugInfo/Mips/eh_frame.ll @@ -0,0 +1,38 @@ +; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s + +; CHECK: .rel.eh_frame +; CHECK: DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .text +; CHECK-NEXT: .gcc_except_table + +@_ZTIi = external constant i8* + +define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind + %0 = bitcast i8* %exception.i to i32* + store i32 5, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn + to label %.noexc unwind label %return + +.noexc: + unreachable + +return: + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind + tail call void @__cxa_end_catch() + ret i32 0 +} + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr + +declare void @__cxa_end_catch() local_unnamed_addr + +declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr -- GitLab From 7f770f7d215d6109fb4a919776261b190c53735d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 15 Oct 2018 13:20:41 +0000 Subject: [PATCH 0195/1116] [ARM][NEON] Improve vector popcnt lowering with PADDL (PR39281) As I suggested on PR39281, this patch uses PADDL pairwise addition to widen from the vXi8 CTPOP result to the target vector type. 
This is a blocker for moving more x86 code to generic vector CTPOP expansion (P32655 + D53258) - ARM's vXi64 CTPOP currently expands, which would generate a vXi64 MUL but ARM's custom lowering expands the general MUL case and vectors aren't well handled in LegalizeDAG - improving the CTPOP lowering was a lot easier than fixing the MUL lowering for this one case...... Differential Revision: https://reviews.llvm.org/D53257 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344512 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 156 +++++------------------------ test/CodeGen/ARM/popcnt.ll | 154 ++++------------------------ 2 files changed, 43 insertions(+), 267 deletions(-) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index bfff368a8fe..3527d049f50 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -669,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); - setOperationAction(ISD::CTPOP, MVT::v1i64, Expand); - setOperationAction(ISD::CTPOP, MVT::v2i64, Expand); + setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); @@ -5409,10 +5409,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, // Compute with: cttz(x) = ctpop(lsb - 1) - // Since we can only compute the number of bits in a byte with vcnt.8, we - // have to gather the result with pairwise addition (vpaddl) for i16, i32, - // and i64. - // Compute LSB - 1. 
SDValue Bits; if (ElemTy == MVT::i64) { @@ -5425,32 +5421,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, DAG.getTargetConstant(1, dl, ElemTy)); Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); } - - // Count #bits with vcnt.8. - EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; - SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); - SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); - - // Gather the #bits with vpaddl (pairwise add.) - EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; - SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt8); - if (ElemTy == MVT::i16) - return Cnt16; - - EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; - SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt16); - if (ElemTy == MVT::i32) - return Cnt32; - - assert(ElemTy == MVT::i64); - SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), - Cnt32); - return Cnt64; + return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if (!ST->hasV6T2Ops()) @@ -5460,112 +5431,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } -/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count -/// for each 16-bit element from operand, repeated. The basic idea is to -/// leverage vcnt to get the 8-bit counts, gather and add the results. 
-/// -/// Trace for v4i16: -/// input = [v0 v1 v2 v3 ] (vi 16-bit element) -/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) -/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) -/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] -/// [b0 b1 b2 b3 b4 b5 b6 b7] -/// +[b1 b0 b3 b2 b5 b4 b7 b6] -/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, -/// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) -static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDLoc DL(N); - - EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; - SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); - SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); - SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); - return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); -} - -/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the -/// bit-count for each 16-bit element from the operand. We need slightly -/// different sequencing for v4i16 and v8i16 to stay within NEON's available -/// 64/128-bit registers. 
-/// -/// Trace for v4i16: -/// input = [v0 v1 v2 v3 ] (vi 16-bit element) -/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) -/// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] -/// v4i16:Extracted = [k0 k1 k2 k3 ] -static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { +static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc DL(N); - SDValue BitCounts = getCTPOP16BitCounts(N, DAG); - if (VT.is64BitVector()) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, - DAG.getIntPtrConstant(0, DL)); - } else { - SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, - BitCounts, DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); - } -} - -/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the -/// bit-count for each 32-bit element from the operand. The idea here is -/// to split the vector into 16-bit elements, leverage the 16-bit count -/// routine, and then combine the results. 
-/// -/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): -/// input = [v0 v1 ] (vi: 32-bit elements) -/// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) -/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) -/// vrev: N0 = [k1 k0 k3 k2 ] -/// [k0 k1 k2 k3 ] -/// N1 =+[k1 k0 k3 k2 ] -/// [k0 k2 k1 k3 ] -/// N2 =+[k1 k3 k0 k2 ] -/// [k0 k2 k1 k3 ] -/// Extended =+[k1 k3 k0 k2 ] -/// [k0 k2 ] -/// Extracted=+[k1 k3 ] -/// -static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - SDLoc DL(N); + assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); + assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || + VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && + "Unexpected type for custom ctpop lowering"); - EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); + Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); - SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); - SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); - SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); - SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); - SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); + // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. + unsigned EltSize = 8; + unsigned NumElts = VT.is64BitVector() ? 
8 : 16; + while (EltSize != VT.getScalarSizeInBits()) { + SmallVector Ops; + Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, + TLI.getPointerTy(DAG.getDataLayout()))); + Ops.push_back(Res); - if (VT.is64BitVector()) { - SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, - DAG.getIntPtrConstant(0, DL)); - } else { - SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, - DAG.getIntPtrConstant(0, DL)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); + EltSize *= 2; + NumElts /= 2; + MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); + Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); } -} -static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - EVT VT = N->getValueType(0); - - assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); - assert((VT == MVT::v2i32 || VT == MVT::v4i32 || - VT == MVT::v4i16 || VT == MVT::v8i16) && - "Unexpected type for custom ctpop lowering"); - - if (VT.getVectorElementType() == MVT::i32) - return lowerCTPOP32BitElements(N, DAG); - else - return lowerCTPOP16BitElements(N, DAG); + return Res; } static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll index 224d5dcb3a6..e3ce5cd1ff9 100644 --- a/test/CodeGen/ARM/popcnt.ll +++ b/test/CodeGen/ARM/popcnt.ll @@ -32,11 +32,7 @@ define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vcnt.8 d16, d16 -; CHECK-NEXT: vrev16.8 d17, d16 -; CHECK-NEXT: vadd.i8 d16, d16, d17 -; CHECK-NEXT: vorr d17, d16, d16 -; CHECK-NEXT: vuzp.8 d16, d17 -; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, <4 x i16>* %A @@ -49,11 +45,7 @@ define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind { ; 
CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vcnt.8 q8, q8 -; CHECK-NEXT: vrev16.8 q9, q8 -; CHECK-NEXT: vadd.i8 q8, q8, q9 -; CHECK-NEXT: vorr q9, q8, q8 -; CHECK-NEXT: vuzp.8 q8, q9 -; CHECK-NEXT: vmovl.u8 q8, d16 +; CHECK-NEXT: vpaddl.u8 q8, q8 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -67,16 +59,8 @@ define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: vldr d16, [r0] ; CHECK-NEXT: vcnt.8 d16, d16 -; CHECK-NEXT: vrev16.8 d17, d16 -; CHECK-NEXT: vadd.i8 d16, d16, d17 -; CHECK-NEXT: vorr d17, d16, d16 -; CHECK-NEXT: vuzp.8 d16, d17 -; CHECK-NEXT: vmovl.u8 q8, d16 -; CHECK-NEXT: vrev32.16 d18, d16 -; CHECK-NEXT: vadd.i16 d16, d16, d18 -; CHECK-NEXT: vorr d17, d16, d16 -; CHECK-NEXT: vuzp.16 d16, d17 -; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, <2 x i32>* %A @@ -89,16 +73,8 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind { ; CHECK: @ %bb.0: ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] ; CHECK-NEXT: vcnt.8 q8, q8 -; CHECK-NEXT: vrev16.8 q9, q8 -; CHECK-NEXT: vadd.i8 q8, q8, q9 -; CHECK-NEXT: vorr q9, q8, q8 -; CHECK-NEXT: vuzp.8 q8, q9 -; CHECK-NEXT: vmovl.u8 q9, d16 -; CHECK-NEXT: vrev32.16 q9, q9 -; CHECK-NEXT: vaddw.u8 q8, q9, d16 -; CHECK-NEXT: vorr q9, q8, q8 -; CHECK-NEXT: vuzp.16 q8, q9 -; CHECK-NEXT: vmovl.u16 q8, d16 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr @@ -110,50 +86,13 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind { define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind { ; CHECK-LABEL: vcnt64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: ldr r2, .LCPI6_0 -; CHECK-NEXT: vmov.32 r0, d16[0] -; CHECK-NEXT: ldr r3, .LCPI6_3 -; CHECK-NEXT: 
vmov.32 r1, d16[1] -; CHECK-NEXT: ldr lr, .LCPI6_2 -; CHECK-NEXT: ldr r12, .LCPI6_1 -; CHECK-NEXT: vldr s1, .LCPI6_4 -; CHECK-NEXT: and r4, r2, r0, lsr #1 -; CHECK-NEXT: sub r0, r0, r4 -; CHECK-NEXT: and r2, r2, r1, lsr #1 -; CHECK-NEXT: sub r1, r1, r2 -; CHECK-NEXT: and r4, r0, r3 -; CHECK-NEXT: and r0, r3, r0, lsr #2 -; CHECK-NEXT: and r2, r1, r3 -; CHECK-NEXT: add r0, r4, r0 -; CHECK-NEXT: and r1, r3, r1, lsr #2 -; CHECK-NEXT: add r1, r2, r1 -; CHECK-NEXT: add r0, r0, r0, lsr #4 -; CHECK-NEXT: and r0, r0, lr -; CHECK-NEXT: add r1, r1, r1, lsr #4 -; CHECK-NEXT: mul r2, r0, r12 -; CHECK-NEXT: and r0, r1, lr -; CHECK-NEXT: mul r1, r0, r12 -; CHECK-NEXT: lsr r0, r2, #24 -; CHECK-NEXT: add r0, r0, r1, lsr #24 -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: pop {r4, lr} +; CHECK-NEXT: vcnt.8 d16, d16 +; CHECK-NEXT: vpaddl.u8 d16, d16 +; CHECK-NEXT: vpaddl.u16 d16, d16 +; CHECK-NEXT: vpaddl.u32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI6_0: -; CHECK-NEXT: .long 1431655765 @ 0x55555555 -; CHECK-NEXT: .LCPI6_1: -; CHECK-NEXT: .long 16843009 @ 0x1010101 -; CHECK-NEXT: .LCPI6_2: -; CHECK-NEXT: .long 252645135 @ 0xf0f0f0f -; CHECK-NEXT: .LCPI6_3: -; CHECK-NEXT: .long 858993459 @ 0x33333333 -; CHECK-NEXT: .LCPI6_4: -; CHECK-NEXT: .long 0 @ float 0 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1) ret <1 x i64> %tmp2 @@ -162,73 +101,14 @@ define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind { define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind { ; CHECK-LABEL: vcntQ64: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vmov.32 r1, d17[1] -; CHECK-NEXT: ldr lr, .LCPI7_0 -; CHECK-NEXT: vmov.32 r2, d17[0] -; CHECK-NEXT: ldr r0, .LCPI7_2 -; CHECK-NEXT: vmov.32 r3, d16[0] -; CHECK-NEXT: ldr r12, .LCPI7_1 -; CHECK-NEXT: ldr r5, 
.LCPI7_3 -; CHECK-NEXT: vldr s3, .LCPI7_4 -; CHECK-NEXT: and r4, lr, r1, lsr #1 -; CHECK-NEXT: sub r1, r1, r4 -; CHECK-NEXT: and r4, r1, r0 -; CHECK-NEXT: and r1, r0, r1, lsr #2 -; CHECK-NEXT: add r1, r4, r1 -; CHECK-NEXT: and r4, lr, r2, lsr #1 -; CHECK-NEXT: sub r2, r2, r4 -; CHECK-NEXT: and r4, r2, r0 -; CHECK-NEXT: add r1, r1, r1, lsr #4 -; CHECK-NEXT: and r2, r0, r2, lsr #2 -; CHECK-NEXT: and r6, r1, r12 -; CHECK-NEXT: add r2, r4, r2 -; CHECK-NEXT: and r4, lr, r3, lsr #1 -; CHECK-NEXT: sub r3, r3, r4 -; CHECK-NEXT: and r4, r3, r0 -; CHECK-NEXT: add r2, r2, r2, lsr #4 -; CHECK-NEXT: and r3, r0, r3, lsr #2 -; CHECK-NEXT: and r2, r2, r12 -; CHECK-NEXT: add r3, r4, r3 -; CHECK-NEXT: add r3, r3, r3, lsr #4 -; CHECK-NEXT: and r3, r3, r12 -; CHECK-NEXT: mul r4, r3, r5 -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: and r1, lr, r3, lsr #1 -; CHECK-NEXT: sub r1, r3, r1 -; CHECK-NEXT: and r3, r1, r0 -; CHECK-NEXT: and r0, r0, r1, lsr #2 -; CHECK-NEXT: mul r1, r2, r5 -; CHECK-NEXT: add r0, r3, r0 -; CHECK-NEXT: mul r2, r6, r5 -; CHECK-NEXT: add r0, r0, r0, lsr #4 -; CHECK-NEXT: and r0, r0, r12 -; CHECK-NEXT: mul r3, r0, r5 -; CHECK-NEXT: lsr r0, r1, #24 -; CHECK-NEXT: lsr r1, r4, #24 -; CHECK-NEXT: add r0, r0, r2, lsr #24 -; CHECK-NEXT: vmov s2, r0 -; CHECK-NEXT: add r0, r1, r3, lsr #24 -; CHECK-NEXT: vmov s0, r0 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: pop {r4, r5, r6, lr} +; CHECK-NEXT: vcnt.8 q8, q8 +; CHECK-NEXT: vpaddl.u8 q8, q8 +; CHECK-NEXT: vpaddl.u16 q8, q8 +; CHECK-NEXT: vpaddl.u32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 ; CHECK-NEXT: mov pc, lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 1431655765 @ 0x55555555 -; CHECK-NEXT: .LCPI7_1: -; CHECK-NEXT: .long 252645135 @ 0xf0f0f0f -; CHECK-NEXT: .LCPI7_2: -; CHECK-NEXT: .long 858993459 @ 0x33333333 -; CHECK-NEXT: .LCPI7_3: -; CHECK-NEXT: .long 16843009 @ 0x1010101 
-; CHECK-NEXT: .LCPI7_4: -; CHECK-NEXT: .long 0 @ float 0 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1) ret <2 x i64> %tmp2 -- GitLab From 1f57e3857aaae24a7afec22084f6c2d421c29c3b Mon Sep 17 00:00:00 2001 From: Aleksandar Beserminji Date: Mon, 15 Oct 2018 14:36:48 +0000 Subject: [PATCH 0196/1116] [mips][micromips] Revert "Fix overlaping FDEs error" This reverts r344511. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344515 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Mips/MCTargetDesc/MipsELFStreamer.cpp | 17 --------- .../Mips/MCTargetDesc/MipsELFStreamer.h | 7 ---- test/DebugInfo/Mips/eh_frame.ll | 38 ------------------- 3 files changed, 62 deletions(-) delete mode 100644 test/DebugInfo/Mips/eh_frame.ll diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 21b01e85096..7b9a02503ce 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -15,7 +15,6 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" @@ -54,22 +53,6 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst, createPendingLabelRelocs(); } -void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { - Frame.Begin = getContext().createTempSymbol(); - MCELFStreamer::EmitLabel(Frame.Begin); -} - -MCSymbol *MipsELFStreamer::EmitCFILabel() { - MCSymbol *Label = getContext().createTempSymbol("cfi", true); - MCELFStreamer::EmitLabel(Label); - return Label; -} - -void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { - Frame.End = getContext().createTempSymbol(); - MCELFStreamer::EmitLabel(Frame.End); -} - void MipsELFStreamer::createPendingLabelRelocs() { MipsTargetELFStreamer *ELFTargetStreamer = 
static_cast(getTargetStreamer()); diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index d140201494f..d141f5d77c6 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -25,7 +25,6 @@ namespace llvm { class MCAsmBackend; class MCCodeEmitter; class MCContext; -class MCDwarfFrameInfo; class MCSubtargetInfo; class MipsELFStreamer : public MCELFStreamer { @@ -61,12 +60,6 @@ public: void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; void EmitIntValue(uint64_t Value, unsigned Size) override; - // Overriding these functions allows us to avoid recording of these labels - // in EmitLabel and later marking them as microMIPS. - void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; - void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; - MCSymbol *EmitCFILabel() override; - /// Emits all the option records stored up until the point it's called. 
void EmitMipsOptionRecords(); diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll deleted file mode 100644 index 4687443cb1c..00000000000 --- a/test/DebugInfo/Mips/eh_frame.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s - -; CHECK: .rel.eh_frame -; CHECK: DW.ref.__gxx_personality_v0 -; CHECK-NEXT: .text -; CHECK-NEXT: .gcc_except_table - -@_ZTIi = external constant i8* - -define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { -entry: - %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind - %0 = bitcast i8* %exception.i to i32* - store i32 5, i32* %0, align 16 - invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn - to label %.noexc unwind label %return - -.noexc: - unreachable - -return: - %1 = landingpad { i8*, i32 } - catch i8* null - %2 = extractvalue { i8*, i32 } %1, 0 - %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind - tail call void @__cxa_end_catch() - ret i32 0 -} - -declare i32 @__gxx_personality_v0(...) - -declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr - -declare void @__cxa_end_catch() local_unnamed_addr - -declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr - -declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr -- GitLab From c2c7e976de76a9e5eb5e56993c3b6289e314f8c9 Mon Sep 17 00:00:00 2001 From: Aleksandar Beserminji Date: Mon, 15 Oct 2018 14:39:12 +0000 Subject: [PATCH 0197/1116] [mips][micromips] Fix overlaping FDEs error When compiling a static executable for micromips, CFI symbols are incorrectly labeled as MICROMIPS, which causes the ".eh_frame_hdr refers to overlapping FDEs." error. This patch does not label CFI symbols as MICROMIPS, and FDEs do not overlap anymore.
This patch also exposes another bug, which is fixed here: https://reviews.llvm.org/D52985 Differential Revision: https://reviews.llvm.org/D52987 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344516 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Mips/MCTargetDesc/MipsELFStreamer.cpp | 17 +++++++++ .../Mips/MCTargetDesc/MipsELFStreamer.h | 7 ++++ test/DebugInfo/Mips/eh_frame.ll | 38 +++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 test/DebugInfo/Mips/eh_frame.ll diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index 7b9a02503ce..21b01e85096 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" @@ -53,6 +54,22 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst, createPendingLabelRelocs(); } +void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) { + Frame.Begin = getContext().createTempSymbol(); + MCELFStreamer::EmitLabel(Frame.Begin); +} + +MCSymbol *MipsELFStreamer::EmitCFILabel() { + MCSymbol *Label = getContext().createTempSymbol("cfi", true); + MCELFStreamer::EmitLabel(Label); + return Label; +} + +void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) { + Frame.End = getContext().createTempSymbol(); + MCELFStreamer::EmitLabel(Frame.End); +} + void MipsELFStreamer::createPendingLabelRelocs() { MipsTargetELFStreamer *ELFTargetStreamer = static_cast(getTargetStreamer()); diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index d141f5d77c6..56a0ff96c7b 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -26,6 +26,7 @@ 
class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCSubtargetInfo; +struct MCDwarfFrameInfo; class MipsELFStreamer : public MCELFStreamer { SmallVector, 8> MipsOptionRecords; @@ -60,6 +61,12 @@ public: void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; void EmitIntValue(uint64_t Value, unsigned Size) override; + // Overriding these functions allows us to avoid recording of these labels + // in EmitLabel and later marking them as microMIPS. + void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; + void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; + MCSymbol *EmitCFILabel() override; + /// Emits all the option records stored up until the point it's called. void EmitMipsOptionRecords(); diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll new file mode 100644 index 00000000000..4687443cb1c --- /dev/null +++ b/test/DebugInfo/Mips/eh_frame.ll @@ -0,0 +1,38 @@ +; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s + +; CHECK: .rel.eh_frame +; CHECK: DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .text +; CHECK-NEXT: .gcc_except_table + +@_ZTIi = external constant i8* + +define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind + %0 = bitcast i8* %exception.i to i32* + store i32 5, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn + to label %.noexc unwind label %return + +.noexc: + unreachable + +return: + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind + tail call void @__cxa_end_catch() + ret i32 0 +} + +declare i32 @__gxx_personality_v0(...) 
+ +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr + +declare void @__cxa_end_catch() local_unnamed_addr + +declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr -- GitLab From bce0a9abfff5f789f591d301b0bb1344ed14921e Mon Sep 17 00:00:00 2001 From: Fedor Sergeev Date: Mon, 15 Oct 2018 15:00:18 +0000 Subject: [PATCH 0198/1116] [NewPM] teach -passes= to emit meaningful error messages Summary: All the PassBuilder::parse interfaces now return a descriptive StringError instead of a plain bool. This allows -passes/aa-pipeline parsing errors to be context-specific and thus less confusing. TODO: ideally we should also make suggestions for misspelled pass names, but that requires some extensions to PassBuilder. Reviewed By: philip.pfaffe, chandlerc Differential Revision: https://reviews.llvm.org/D53246 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344519 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Passes/PassBuilder.h | 51 +-- lib/LTO/LTOBackend.cpp | 14 +- lib/Passes/PassBuilder.cpp | 366 ++++++++++++--------- test/Other/pass-pipeline-parsing.ll | 83 ++++- test/tools/llvm-lto2/X86/pipeline.ll | 4 +- test/tools/llvm-opt-fuzzer/command-line.ll | 2 +- tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp | 11 +- tools/opt/NewPMDriver.cpp | 84 +++-- unittests/IR/CMakeLists.txt | 2 + unittests/IR/PassBuilderCallbacksTest.cpp | 37 ++- unittests/Passes/CMakeLists.txt | 1 + unittests/Passes/PluginsTest.cpp | 5 +- 12 files changed, 394 insertions(+), 266 deletions(-) diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index 91314430a96..22e5eb0caa0 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -19,6 +19,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/Error.h" #include "llvm/Transforms/Instrumentation.h" #include
"llvm/Transforms/Scalar/LoopPassManager.h" #include @@ -384,8 +385,9 @@ public: /// If the sequence of passes aren't all the exact same kind of pass, it will /// be an error. You cannot mix different levels implicitly, you must /// explicitly form a pass manager in which to nest passes. - bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText, - bool VerifyEachPass = true, bool DebugLogging = false); + Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText, + bool VerifyEachPass = true, + bool DebugLogging = false); /// {{@ Parse a textual pass pipeline description into a specific PassManager /// @@ -394,12 +396,15 @@ public: /// this is the valid pipeline text: /// /// function(lpass) - bool parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText, - bool VerifyEachPass = true, bool DebugLogging = false); - bool parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText, - bool VerifyEachPass = true, bool DebugLogging = false); - bool parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText, - bool VerifyEachPass = true, bool DebugLogging = false); + Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText, + bool VerifyEachPass = true, + bool DebugLogging = false); + Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText, + bool VerifyEachPass = true, + bool DebugLogging = false); + Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText, + bool VerifyEachPass = true, + bool DebugLogging = false); /// @}} /// Parse a textual alias analysis pipeline into the provided AA manager. @@ -417,7 +422,7 @@ public: /// Returns false if the text cannot be parsed cleanly. The specific state of /// the \p AA manager is unspecified if such an error is encountered and this /// returns false. 
- bool parseAAPipeline(AAManager &AA, StringRef PipelineText); + Error parseAAPipeline(AAManager &AA, StringRef PipelineText); /// Register a callback for a default optimizer pipeline extension /// point @@ -565,28 +570,28 @@ private: static Optional> parsePipelineText(StringRef Text); - bool parseModulePass(ModulePassManager &MPM, const PipelineElement &E, + Error parseModulePass(ModulePassManager &MPM, const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging); + Error parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E, bool VerifyEachPass, bool DebugLogging); - bool parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E, + Error parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging); + Error parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, bool VerifyEachPass, bool DebugLogging); - bool parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging); - bool parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging); bool parseAAPassName(AAManager &AA, StringRef Name); - bool parseLoopPassPipeline(LoopPassManager &LPM, - ArrayRef Pipeline, - bool VerifyEachPass, bool DebugLogging); - bool parseFunctionPassPipeline(FunctionPassManager &FPM, - ArrayRef Pipeline, - bool VerifyEachPass, bool DebugLogging); - bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM, + Error parseLoopPassPipeline(LoopPassManager &LPM, ArrayRef Pipeline, bool VerifyEachPass, bool DebugLogging); - bool parseModulePassPipeline(ModulePassManager &MPM, + Error parseFunctionPassPipeline(FunctionPassManager &FPM, + ArrayRef Pipeline, + bool VerifyEachPass, bool DebugLogging); + Error parseCGSCCPassPipeline(CGSCCPassManager &CGPM, ArrayRef Pipeline, bool VerifyEachPass, bool DebugLogging); + Error parseModulePassPipeline(ModulePassManager &MPM, + ArrayRef Pipeline, + bool VerifyEachPass, bool DebugLogging); 
void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, OptimizationLevel Level, bool RunProfileGen, diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 20fc40de4b9..1f9d60a5bdf 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -162,7 +162,7 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM, AAManager AA; // Parse a custom AA pipeline if asked to. - if (!PB.parseAAPipeline(AA, "default")) + if (auto Err = PB.parseAAPipeline(AA, "default")) report_fatal_error("Error parsing default AA pipeline"); LoopAnalysisManager LAM(Conf.DebugPassManager); @@ -221,9 +221,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM, // Parse a custom AA pipeline if asked to. if (!AAPipelineDesc.empty()) - if (!PB.parseAAPipeline(AA, AAPipelineDesc)) - report_fatal_error("unable to parse AA pipeline description: " + - AAPipelineDesc); + if (auto Err = PB.parseAAPipeline(AA, AAPipelineDesc)) + report_fatal_error("unable to parse AA pipeline description '" + + AAPipelineDesc + "': " + toString(std::move(Err))); LoopAnalysisManager LAM; FunctionAnalysisManager FAM; @@ -246,9 +246,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM, MPM.addPass(VerifierPass()); // Now, add all the passes we've been requested to. 
- if (!PB.parsePassPipeline(MPM, PipelineDesc)) - report_fatal_error("unable to parse pass pipeline description: " + - PipelineDesc); + if (auto Err = PB.parsePassPipeline(MPM, PipelineDesc)) + report_fatal_error("unable to parse pass pipeline description '" + + PipelineDesc + "': " + toString(std::move(Err))); if (!DisableVerify) MPM.addPass(VerifierPass()); diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 09758dc5651..f6313d23e2d 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -58,6 +58,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Regex.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" @@ -1402,9 +1403,9 @@ PassBuilder::parsePipelineText(StringRef Text) { return {std::move(ResultPipeline)}; } -bool PassBuilder::parseModulePass(ModulePassManager &MPM, - const PipelineElement &E, bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parseModulePass(ModulePassManager &MPM, + const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging) { auto &Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1412,50 +1413,56 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM, if (!InnerPipeline.empty()) { if (Name == "module") { ModulePassManager NestedMPM(DebugLogging); - if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; MPM.addPass(std::move(NestedMPM)); - return true; + return Error::success(); } if (Name == "cgscc") { CGSCCPassManager CGPM(DebugLogging); - if (!parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + 
return Err; MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); - return true; + return Error::success(); } if (Name == "function") { FunctionPassManager FPM(DebugLogging); - if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - return true; + return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { ModulePassManager NestedMPM(DebugLogging); - if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM))); - return true; + return Error::success(); } for (auto &C : ModulePipelineParsingCallbacks) if (C(Name, MPM, InnerPipeline)) - return true; + return Error::success(); // Normal passes can't have pipelines. - return false; + return make_error( + formatv("invalid use of '{0}' pass as module pipeline", Name).str(), + inconvertibleErrorCode()); + ; } // Manually handle aliases for pre-configured pipeline fragments. if (startsWithDefaultPipelineAliasPrefix(Name)) { SmallVector Matches; if (!DefaultAliasRegex.match(Name, &Matches)) - return false; + return make_error( + formatv("unknown default pipeline alias '{0}'", Name).str(), + inconvertibleErrorCode()); + assert(Matches.size() == 3 && "Must capture two matched strings!"); OptimizationLevel L = StringSwitch(Matches[2]) @@ -1467,7 +1474,7 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM, .Case("Oz", Oz); if (L == O0) // At O0 we do nothing at all! 
- return true; + return Error::success(); if (Matches[1] == "default") { MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging)); @@ -1481,38 +1488,40 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM, assert(Matches[1] == "lto" && "Not one of the matched options!"); MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr)); } - return true; + return Error::success(); } // Finally expand the basic registered passes from the .inc file. #define MODULE_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ MPM.addPass(CREATE_PASS); \ - return true; \ + return Error::success(); \ } #define MODULE_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ MPM.addPass( \ RequireAnalysisPass< \ std::remove_reference::type, Module>()); \ - return true; \ + return Error::success(); \ } \ if (Name == "invalidate<" NAME ">") { \ MPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return true; \ + return Error::success(); \ } #include "PassRegistry.def" for (auto &C : ModulePipelineParsingCallbacks) if (C(Name, MPM, InnerPipeline)) - return true; - return false; + return Error::success(); + return make_error( + formatv("unknown module pass '{0}'", Name).str(), + inconvertibleErrorCode()); } -bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, - const PipelineElement &E, bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, + const PipelineElement &E, bool VerifyEachPass, + bool DebugLogging) { auto &Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1520,53 +1529,55 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, if (!InnerPipeline.empty()) { if (Name == "cgscc") { CGSCCPassManager NestedCGPM(DebugLogging); - if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; // Add the nested pass manager with the 
appropriate adaptor. CGPM.addPass(std::move(NestedCGPM)); - return true; + return Error::success(); } if (Name == "function") { FunctionPassManager FPM(DebugLogging); - if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; // Add the nested pass manager with the appropriate adaptor. CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); - return true; + return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { CGSCCPassManager NestedCGPM(DebugLogging); - if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM))); - return true; + return Error::success(); } if (auto MaxRepetitions = parseDevirtPassName(Name)) { CGSCCPassManager NestedCGPM(DebugLogging); - if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; CGPM.addPass( createDevirtSCCRepeatedPass(std::move(NestedCGPM), *MaxRepetitions)); - return true; + return Error::success(); } for (auto &C : CGSCCPipelineParsingCallbacks) if (C(Name, CGPM, InnerPipeline)) - return true; + return Error::success(); // Normal passes can't have pipelines. - return false; + return make_error( + formatv("invalid use of '{0}' pass as cgscc pipeline", Name).str(), + inconvertibleErrorCode()); } // Now expand the basic registered passes from the .inc file. 
#define CGSCC_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ CGPM.addPass(CREATE_PASS); \ - return true; \ + return Error::success(); \ } #define CGSCC_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ @@ -1574,24 +1585,26 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, std::remove_reference::type, \ LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &, \ CGSCCUpdateResult &>()); \ - return true; \ + return Error::success(); \ } \ if (Name == "invalidate<" NAME ">") { \ CGPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return true; \ + return Error::success(); \ } #include "PassRegistry.def" for (auto &C : CGSCCPipelineParsingCallbacks) if (C(Name, CGPM, InnerPipeline)) - return true; - return false; + return Error::success(); + return make_error( + formatv("unknown cgscc pass '{0}'", Name).str(), + inconvertibleErrorCode()); } -bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM, - const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging) { +Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, + const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging) { auto &Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1599,68 +1612,72 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM, if (!InnerPipeline.empty()) { if (Name == "function") { FunctionPassManager NestedFPM(DebugLogging); - if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; // Add the nested pass manager with the appropriate adaptor. 
FPM.addPass(std::move(NestedFPM)); - return true; + return Error::success(); } if (Name == "loop") { LoopPassManager LPM(DebugLogging); - if (!parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return Err; // Add the nested pass manager with the appropriate adaptor. FPM.addPass( createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging)); - return true; + return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { FunctionPassManager NestedFPM(DebugLogging); - if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM))); - return true; + return Error::success(); } for (auto &C : FunctionPipelineParsingCallbacks) if (C(Name, FPM, InnerPipeline)) - return true; + return Error::success(); // Normal passes can't have pipelines. - return false; + return make_error( + formatv("invalid use of '{0}' pass as function pipeline", Name).str(), + inconvertibleErrorCode()); } // Now expand the basic registered passes from the .inc file. 
#define FUNCTION_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ FPM.addPass(CREATE_PASS); \ - return true; \ + return Error::success(); \ } #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ FPM.addPass( \ RequireAnalysisPass< \ std::remove_reference::type, Function>()); \ - return true; \ + return Error::success(); \ } \ if (Name == "invalidate<" NAME ">") { \ FPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return true; \ + return Error::success(); \ } #include "PassRegistry.def" for (auto &C : FunctionPipelineParsingCallbacks) if (C(Name, FPM, InnerPipeline)) - return true; - return false; + return Error::success(); + return make_error( + formatv("unknown function pass '{0}'", Name).str(), + inconvertibleErrorCode()); } -bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging) { +Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging) { StringRef Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1668,35 +1685,37 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, if (!InnerPipeline.empty()) { if (Name == "loop") { LoopPassManager NestedLPM(DebugLogging); - if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; // Add the nested pass manager with the appropriate adaptor. 
LPM.addPass(std::move(NestedLPM)); - return true; + return Error::success(); } if (auto Count = parseRepeatPassName(Name)) { LoopPassManager NestedLPM(DebugLogging); - if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return false; + if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline, + VerifyEachPass, DebugLogging)) + return Err; LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM))); - return true; + return Error::success(); } for (auto &C : LoopPipelineParsingCallbacks) if (C(Name, LPM, InnerPipeline)) - return true; + return Error::success(); // Normal passes can't have pipelines. - return false; + return make_error( + formatv("invalid use of '{0}' pass as loop pipeline", Name).str(), + inconvertibleErrorCode()); } // Now expand the basic registered passes from the .inc file. #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ LPM.addPass(CREATE_PASS); \ - return true; \ + return Error::success(); \ } #define LOOP_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ @@ -1704,19 +1723,20 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, std::remove_reference::type, Loop, \ LoopAnalysisManager, LoopStandardAnalysisResults &, \ LPMUpdater &>()); \ - return true; \ + return Error::success(); \ } \ if (Name == "invalidate<" NAME ">") { \ LPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return true; \ + return Error::success(); \ } #include "PassRegistry.def" for (auto &C : LoopPipelineParsingCallbacks) if (C(Name, LPM, InnerPipeline)) - return true; - return false; + return Error::success(); + return make_error(formatv("unknown loop pass '{0}'", Name).str(), + inconvertibleErrorCode()); } bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) { @@ -1740,41 +1760,42 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) { return false; } -bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM, - 
ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (!parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging)) - return false; + if (auto Err = parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging)) + return Err; // FIXME: No verifier support for Loop passes! } - return true; + return Error::success(); } -bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM, - ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (!parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging)) - return false; + if (auto Err = + parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging)) + return Err; if (VerifyEachPass) FPM.addPass(VerifierPass()); } - return true; + return Error::success(); } -bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, - ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (!parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging)) - return false; + if (auto Err = parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging)) + return Err; // FIXME: No verifier support for CGSCC passes! 
} - return true; + return Error::success(); } void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, @@ -1790,28 +1811,30 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); } -bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, - ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging)) - return false; + if (auto Err = parseModulePass(MPM, Element, VerifyEachPass, DebugLogging)) + return Err; if (VerifyEachPass) MPM.addPass(VerifierPass()); } - return true; + return Error::success(); } // Primary pass pipeline description parsing routine for a \c ModulePassManager // FIXME: Should this routine accept a TargetMachine or require the caller to // pre-populate the analysis managers with target-specific stuff? -bool PassBuilder::parsePassPipeline(ModulePassManager &MPM, - StringRef PipelineText, bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parsePassPipeline(ModulePassManager &MPM, + StringRef PipelineText, + bool VerifyEachPass, bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return false; + return make_error( + formatv("invalid pipeline '{0}'", PipelineText).str(), + inconvertibleErrorCode()); // If the first name isn't at the module layer, wrap the pipeline up // automatically. @@ -1828,73 +1851,106 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM, } else { for (auto &C : TopLevelPipelineParsingCallbacks) if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging)) - return true; - - // Unknown pass name! - return false; + return Error::success(); + + // Unknown pass or pipeline name! 
+ auto &InnerPipeline = Pipeline->front().InnerPipeline; + return make_error( + formatv("unknown {0} name '{1}'", + (InnerPipeline.empty() ? "pass" : "pipeline"), FirstName) + .str(), + inconvertibleErrorCode()); } } - return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging); + if (auto Err = + parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging)) + return Err; + return Error::success(); } // Primary pass pipeline description parsing routine for a \c CGSCCPassManager -bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM, - StringRef PipelineText, bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM, + StringRef PipelineText, + bool VerifyEachPass, bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return false; + return make_error( + formatv("invalid pipeline '{0}'", PipelineText).str(), + inconvertibleErrorCode()); StringRef FirstName = Pipeline->front().Name; if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks)) - return false; - - return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging); + return make_error( + formatv("unknown cgscc pass '{0}' in pipeline '{1}'", FirstName, + PipelineText) + .str(), + inconvertibleErrorCode()); + + if (auto Err = + parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging)) + return Err; + return Error::success(); } // Primary pass pipeline description parsing routine for a \c // FunctionPassManager -bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM, - StringRef PipelineText, bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parsePassPipeline(FunctionPassManager &FPM, + StringRef PipelineText, + bool VerifyEachPass, bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return false; + return make_error( + formatv("invalid pipeline '{0}'", PipelineText).str(), + 
inconvertibleErrorCode()); StringRef FirstName = Pipeline->front().Name; if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks)) - return false; - - return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass, - DebugLogging); + return make_error( + formatv("unknown function pass '{0}' in pipeline '{1}'", FirstName, + PipelineText) + .str(), + inconvertibleErrorCode()); + + if (auto Err = parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass, + DebugLogging)) + return Err; + return Error::success(); } // Primary pass pipeline description parsing routine for a \c LoopPassManager -bool PassBuilder::parsePassPipeline(LoopPassManager &CGPM, - StringRef PipelineText, bool VerifyEachPass, - bool DebugLogging) { +Error PassBuilder::parsePassPipeline(LoopPassManager &CGPM, + StringRef PipelineText, + bool VerifyEachPass, bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return false; + return make_error( + formatv("invalid pipeline '{0}'", PipelineText).str(), + inconvertibleErrorCode()); - return parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging); + if (auto Err = + parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging)) + return Err; + + return Error::success(); } -bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) { +Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) { // If the pipeline just consists of the word 'default' just replace the AA // manager with our default one. 
if (PipelineText == "default") { AA = buildDefaultAAPipeline(); - return true; + return Error::success(); } while (!PipelineText.empty()) { StringRef Name; std::tie(Name, PipelineText) = PipelineText.split(','); if (!parseAAPassName(AA, Name)) - return false; + return make_error( + formatv("unknown alias analysis name '{0}'", Name).str(), + inconvertibleErrorCode()); } - return true; + return Error::success(); } diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll index b303318c796..d13a977dbce 100644 --- a/test/Other/pass-pipeline-parsing.ll +++ b/test/Other/pass-pipeline-parsing.ll @@ -54,52 +54,52 @@ ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-module)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED1 -; CHECK-UNBALANCED1: unable to parse pass pipeline description +; CHECK-UNBALANCED1: invalid pipeline 'no-op-module)' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='module(no-op-module))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED2 -; CHECK-UNBALANCED2: unable to parse pass pipeline description +; CHECK-UNBALANCED2: invalid pipeline 'module(no-op-module))' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='module(no-op-module' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED3 -; CHECK-UNBALANCED3: unable to parse pass pipeline description +; CHECK-UNBALANCED3: invalid pipeline 'module(no-op-module' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED4 -; CHECK-UNBALANCED4: unable to parse pass pipeline description +; CHECK-UNBALANCED4: invalid pipeline 'no-op-function)' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(no-op-function))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED5 -; CHECK-UNBALANCED5: unable to parse pass pipeline description +; CHECK-UNBALANCED5: 
invalid pipeline 'function(no-op-function))' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(function(no-op-function)))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED6 -; CHECK-UNBALANCED6: unable to parse pass pipeline description +; CHECK-UNBALANCED6: invalid pipeline 'function(function(no-op-function)))' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(no-op-function' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED7 -; CHECK-UNBALANCED7: unable to parse pass pipeline description +; CHECK-UNBALANCED7: invalid pipeline 'function(no-op-function' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(function(no-op-function)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED8 -; CHECK-UNBALANCED8: unable to parse pass pipeline description +; CHECK-UNBALANCED8: invalid pipeline 'function(function(no-op-function)' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-module,)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED9 -; CHECK-UNBALANCED9: unable to parse pass pipeline description +; CHECK-UNBALANCED9: invalid pipeline 'no-op-module,)' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function,)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED10 -; CHECK-UNBALANCED10: unable to parse pass pipeline description +; CHECK-UNBALANCED10: invalid pipeline 'no-op-function,)' ; RUN: opt -disable-output -debug-pass-manager \ ; RUN: -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \ @@ -176,37 +176,86 @@ ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(no-op-function)function(no-op-function)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-MISSING-COMMA1 -; CHECK-MISSING-COMMA1: unable to parse pass pipeline description +; CHECK-MISSING-COMMA1: invalid pipeline 'function(no-op-function)function(no-op-function)' ; RUN: not opt -disable-output 
-debug-pass-manager \ ; RUN: -passes='function()' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-EMPTY-INNER-PIPELINE -; CHECK-EMPTY-INNER-PIPELINE: unable to parse pass pipeline description +; CHECK-EMPTY-INNER-PIPELINE: unknown function pass '' ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-module(no-op-module,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-MODULE-PASS -; CHECK-PIPELINE-ON-MODULE-PASS: unable to parse pass pipeline description +; CHECK-PIPELINE-ON-MODULE-PASS: invalid use of 'no-op-module' pass as module pipeline ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-cgscc(no-op-cgscc,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-CGSCC-PASS -; CHECK-PIPELINE-ON-CGSCC-PASS: unable to parse pass pipeline description +; CHECK-PIPELINE-ON-CGSCC-PASS: invalid use of 'no-op-cgscc' pass as cgscc pipeline ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function(no-op-function,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-FUNCTION-PASS -; CHECK-PIPELINE-ON-FUNCTION-PASS: unable to parse pass pipeline description +; CHECK-PIPELINE-ON-FUNCTION-PASS: invalid use of 'no-op-function' pass as function pipeline ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-loop(no-op-loop,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-LOOP-PASS -; CHECK-PIPELINE-ON-LOOP-PASS: unable to parse pass pipeline description +; CHECK-PIPELINE-ON-LOOP-PASS: invalid use of 'no-op-loop' pass as loop pipeline ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function()' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-EMPTY-PIPELINE-ON-PASS -; CHECK-EMPTY-PIPELINE-ON-PASS: unable to parse pass pipeline description +; CHECK-EMPTY-PIPELINE-ON-PASS: invalid use of 'no-op-function' pass as function pipeline + +; RUN: not opt 
-passes='no-op-module,bad' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-MODULE +; CHECK-UNKNOWN-MODULE: opt: unknown module pass 'bad' + +; RUN: not opt -passes='no-op-loop,bad' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP +; CHECK-UNKNOWN-LOOP: opt: unknown loop pass 'bad' + +; RUN: not opt -passes='no-op-cgscc,bad' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC +; CHECK-UNKNOWN-CGSCC: opt: unknown cgscc pass 'bad' + +; RUN: not opt -passes='no-op-function,bad' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION +; RUN: not opt -passes='function(bad,pipeline,text)' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION +; RUN: not opt -passes='module(no-op-module,function(bad,pipeline,text))' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION +; RUN: not opt -passes='no-op-module,function(bad,pipeline,text)' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION +; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION +; CHECK-UNKNOWN-FUNCTION: opt: unknown function pass 'bad' + +; RUN: not opt -aa-pipeline=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR +; AA-PIPELINE-ERR: unknown alias analysis name 'bad' +; RUN: opt -passes-ep-peephole=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PEEPHOLE-ERR +; PASSES-EP-PEEPHOLE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. 
+; RUN: opt -passes-ep-late-loop-optimizations=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LATELOOPOPT-ERR +; PASSES-EP-LATELOOPOPT-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. +; RUN: opt -passes-ep-loop-optimizer-end=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LOOPOPTEND-ERR +; PASSES-EP-LOOPOPTEND-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. +; RUN: opt -passes-ep-scalar-optimizer-late=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-SCALAROPTLATE-ERR +; PASSES-EP-SCALAROPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. +; RUN: opt -passes-ep-cgscc-optimizer-late=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-CGSCCOPTLATE-ERR +; PASSES-EP-CGSCCOPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. +; RUN: opt -passes-ep-vectorizer-start=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-VECTORIZERSTART-ERR +; PASSES-EP-VECTORIZERSTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. +; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \ +; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR +; PASSES-EP-PIPELINESTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. 
define void @f() { entry: diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll index 29276d8d13a..9ab81ac70a7 100644 --- a/test/tools/llvm-lto2/X86/pipeline.ll +++ b/test/tools/llvm-lto2/X86/pipeline.ll @@ -32,11 +32,11 @@ define void @patatino() { ; RUN: -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \ ; RUN: FileCheck %s --check-prefix=ERR -; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo +; ERR: LLVM ERROR: unable to parse pass pipeline description 'foogoo': unknown pass name 'foogoo' ; RUN: not llvm-lto2 run %t1.bc -o %t.o \ ; RUN: -r %t1.bc,patatino,px -aa-pipeline patatino \ ; RUN: -opt-pipeline loweratomic 2>&1 | \ ; RUN: FileCheck %s --check-prefix=AAERR -; AAERR: LLVM ERROR: unable to parse AA pipeline description: patatino +; AAERR: LLVM ERROR: unable to parse AA pipeline description 'patatino': unknown alias analysis name 'patatino' diff --git a/test/tools/llvm-opt-fuzzer/command-line.ll b/test/tools/llvm-opt-fuzzer/command-line.ll index f747bba431b..8c3f6b60154 100644 --- a/test/tools/llvm-opt-fuzzer/command-line.ll +++ b/test/tools/llvm-opt-fuzzer/command-line.ll @@ -13,7 +13,7 @@ ; Don't start with incorrect passes specified ; RUN: not llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes no-pass 2>&1 | FileCheck -check-prefix=PIPELINE %s -; PIPELINE: can't parse pass pipeline +; PIPELINE: unknown pass name 'no-pass' ; Correct command line ; RUN: llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine 2>&1 | FileCheck -check-prefix=CORRECT %s diff --git a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp index 98d5428ddd1..57e75b1db9e 100644 --- a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp +++ b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp @@ -144,9 +144,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { PB.registerLoopAnalyses(LAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - bool Ok = 
PB.parsePassPipeline(MPM, PassPipeline, false, false); - assert(Ok && "Should have been checked during fuzzer initialization"); - (void)Ok; // silence unused variable warning on release builds + auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false); + assert(!Err && "Should have been checked during fuzzer initialization"); + // Only fail with assert above, otherwise ignore the parsing error. + consumeError(std::move(Err)); // Run passes which we need to test // @@ -235,8 +236,8 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize( PassBuilder PB(TM.get()); ModulePassManager MPM; - if (!PB.parsePassPipeline(MPM, PassPipeline, false, false)) { - errs() << *argv[0] << ": can't parse pass pipeline\n"; + if (auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false)) { + errs() << *argv[0] << ": " << toString(std::move(Err)) << "\n"; exit(1); } diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp index e63547a79d0..11879d26a6c 100644 --- a/tools/opt/NewPMDriver.cpp +++ b/tools/opt/NewPMDriver.cpp @@ -124,12 +124,12 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) { // Verify the pipeline is parseable: PassManagerT PM; - if (PB.parsePassPipeline(PM, PipelineText)) - return true; - - errs() << "Could not parse pipeline '" << PipelineText - << "'. I'm going to igore it.\n"; - return false; + if (auto Err = PB.parsePassPipeline(PM, PipelineText)) { + errs() << "Could not parse pipeline '" << PipelineText + << "'. 
I'm going to ignore it.\n"; + return false; + } + return true; } /// If one of the EPPipeline command line options was given, register callbacks @@ -137,50 +137,61 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) { static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass, bool DebugLogging) { if (tryParsePipelineText(PB, PeepholeEPPipeline)) - PB.registerPeepholeEPCallback([&PB, VerifyEachPass, DebugLogging]( - FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { - PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass, - DebugLogging); - }); + PB.registerPeepholeEPCallback( + [&PB, VerifyEachPass, DebugLogging]( + FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { + ExitOnError Err("Unable to parse PeepholeEP pipeline: "); + Err(PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass, + DebugLogging)); + }); if (tryParsePipelineText(PB, LateLoopOptimizationsEPPipeline)) PB.registerLateLoopOptimizationsEPCallback( [&PB, VerifyEachPass, DebugLogging]( LoopPassManager &PM, PassBuilder::OptimizationLevel Level) { - PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline, - VerifyEachPass, DebugLogging); + ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: "); + Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline, + VerifyEachPass, DebugLogging)); }); if (tryParsePipelineText(PB, LoopOptimizerEndEPPipeline)) - PB.registerLoopOptimizerEndEPCallback([&PB, VerifyEachPass, DebugLogging]( - LoopPassManager &PM, PassBuilder::OptimizationLevel Level) { - PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, VerifyEachPass, - DebugLogging); - }); + PB.registerLoopOptimizerEndEPCallback( + [&PB, VerifyEachPass, DebugLogging]( + LoopPassManager &PM, PassBuilder::OptimizationLevel Level) { + ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: "); + Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, + VerifyEachPass, DebugLogging)); + }); if (tryParsePipelineText(PB, 
ScalarOptimizerLateEPPipeline)) PB.registerScalarOptimizerLateEPCallback( [&PB, VerifyEachPass, DebugLogging]( FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { - PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline, - VerifyEachPass, DebugLogging); + ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: "); + Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline, + VerifyEachPass, DebugLogging)); }); if (tryParsePipelineText(PB, CGSCCOptimizerLateEPPipeline)) - PB.registerCGSCCOptimizerLateEPCallback([&PB, VerifyEachPass, DebugLogging]( - CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) { - PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, VerifyEachPass, - DebugLogging); - }); + PB.registerCGSCCOptimizerLateEPCallback( + [&PB, VerifyEachPass, DebugLogging]( + CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) { + ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: "); + Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, + VerifyEachPass, DebugLogging)); + }); if (tryParsePipelineText(PB, VectorizerStartEPPipeline)) - PB.registerVectorizerStartEPCallback([&PB, VerifyEachPass, DebugLogging]( - FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { - PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass, - DebugLogging); - }); + PB.registerVectorizerStartEPCallback( + [&PB, VerifyEachPass, DebugLogging]( + FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { + ExitOnError Err("Unable to parse VectorizerStartEP pipeline: "); + Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline, + VerifyEachPass, DebugLogging)); + }); if (tryParsePipelineText(PB, PipelineStartEPPipeline)) PB.registerPipelineStartEPCallback( [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) { - PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass, - DebugLogging); + ExitOnError Err("Unable to parse PipelineStartEP pipeline: "); + Err(PB.parsePassPipeline(PM, 
PipelineStartEPPipeline, VerifyEachPass, + DebugLogging)); }); } @@ -258,8 +269,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, // Specially handle the alias analysis manager so that we can register // a custom pipeline of AA passes with it. AAManager AA; - if (!PB.parseAAPipeline(AA, AAPipeline)) { - errs() << Arg0 << ": unable to parse AA pipeline description.\n"; + if (auto Err = PB.parseAAPipeline(AA, AAPipeline)) { + errs() << Arg0 << ": " << toString(std::move(Err)) << "\n"; return false; } @@ -284,8 +295,9 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, if (EnableDebugify) MPM.addPass(NewPMDebugifyPass()); - if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) { - errs() << Arg0 << ": unable to parse pass pipeline description.\n"; + if (auto Err = + PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) { + errs() << Arg0 << ": " << toString(std::move(Err)) << "\n"; return false; } diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt index 211ab109131..7498983b260 100644 --- a/unittests/IR/CMakeLists.txt +++ b/unittests/IR/CMakeLists.txt @@ -40,3 +40,5 @@ add_llvm_unittest(IRTests VerifierTest.cpp WaymarkTest.cpp ) + +target_link_libraries(IRTests PRIVATE LLVMTestingSupport) diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp index 97bbb81a6b0..20c47b045e7 100644 --- a/unittests/IR/PassBuilderCallbacksTest.cpp +++ b/unittests/IR/PassBuilderCallbacksTest.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Testing/Support/Error.h" #include #include #include @@ -460,7 +461,7 @@ TEST_F(ModuleCallbacksTest, Passes) { .WillOnce(Invoke(getAnalysisResult)); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline 
was: " << PipelineText; PM.run(*M, AM); @@ -494,7 +495,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); @@ -525,7 +526,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); @@ -537,7 +538,7 @@ TEST_F(FunctionCallbacksTest, Passes) { .WillOnce(Invoke(getAnalysisResult)); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -571,7 +572,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -604,7 +605,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -615,7 +616,7 @@ TEST_F(LoopCallbacksTest, Passes) { .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult))); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline 
was: " << PipelineText; PM.run(*M, AM); } @@ -650,7 +651,7 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -682,7 +683,7 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -693,7 +694,7 @@ TEST_F(CGSCCCallbacksTest, Passes) { .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult))); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -727,7 +728,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -759,7 +760,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -774,7 +775,7 @@ TEST_F(ModuleCallbacksTest, AnalysisUtilities) { EXPECT_CALL(AnalysisHandle, invalidate(HasName(""), _, _)); StringRef PipelineText = "require,invalidate"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, 
true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -784,7 +785,7 @@ TEST_F(CGSCCCallbacksTest, PassUtilities) { EXPECT_CALL(AnalysisHandle, invalidate(HasName("(foo)"), _, _)); StringRef PipelineText = "require,invalidate"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -794,7 +795,7 @@ TEST_F(FunctionCallbacksTest, AnalysisUtilities) { EXPECT_CALL(AnalysisHandle, invalidate(HasName("foo"), _, _)); StringRef PipelineText = "require,invalidate"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -805,7 +806,7 @@ TEST_F(LoopCallbacksTest, PassUtilities) { StringRef PipelineText = "require,invalidate"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -845,13 +846,13 @@ TEST_F(ModuleCallbacksTest, ParseTopLevelPipeline) { StringRef PipelineText = "another-pipeline(test-transform,invalidate)"; - ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) << "Pipeline was: " << PipelineText; PM.run(*M, AM); /// Test the negative case PipelineText = "another-pipeline(instcombine)"; - ASSERT_FALSE(PB.parsePassPipeline(PM, PipelineText, true)) + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Failed()) << "Pipeline was: " << PipelineText; } } // end anonymous namespace diff --git a/unittests/Passes/CMakeLists.txt b/unittests/Passes/CMakeLists.txt index d90df209d4e..415f3a71734 100644 --- a/unittests/Passes/CMakeLists.txt +++ b/unittests/Passes/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_unittest(PluginsTests 
PluginsTest.cpp ) export_executable_symbols(PluginsTests) +target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport) set(LLVM_LINK_COMPONENTS) add_llvm_loadable_module(TestPlugin diff --git a/unittests/Passes/PluginsTest.cpp b/unittests/Passes/PluginsTest.cpp index 726978714e8..abb7b57ee0c 100644 --- a/unittests/Passes/PluginsTest.cpp +++ b/unittests/Passes/PluginsTest.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Path.h" +#include "llvm/Testing/Support/Error.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "gtest/gtest.h" @@ -54,8 +55,8 @@ TEST(PluginsTests, LoadPlugin) { PassBuilder PB; ModulePassManager PM; - ASSERT_FALSE(PB.parsePassPipeline(PM, "plugin-pass")); + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Failed()); Plugin->registerPassBuilderCallbacks(PB); - ASSERT_TRUE(PB.parsePassPipeline(PM, "plugin-pass")); + ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Succeeded()); } -- GitLab From cb8b3a2740adc2ea3b866bab8b727b0a93e96353 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 15:26:47 +0000 Subject: [PATCH 0199/1116] [ADT] Adds equality operators for DenseMap and DenseSet, and an initializer_list constructor for DenseMap (DenseSet already had an initializer_list constructor). These changes make it easier to migrate existing code that uses std::map and std::set (which support initializer_list construction and equality comparison) to DenseMap and DenseSet. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344522 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/DenseMap.h | 43 ++++++++++++++++++++++++++++++++++ include/llvm/ADT/DenseSet.h | 28 ++++++++++++++++++++++ unittests/ADT/DenseMapTest.cpp | 20 ++++++++++++++++ unittests/ADT/DenseSetTest.cpp | 9 +++++++ 4 files changed, 100 insertions(+) diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index 8fe0f48adf2..ac1e5c632d3 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,9 @@ namespace detail { // implementation without requiring two members. template struct DenseMapPair : public std::pair { + + using std::pair::pair; + KeyT &getFirst() { return std::pair::first; } const KeyT &getFirst() const { return std::pair::first; } ValueT &getSecond() { return std::pair::second; } @@ -640,6 +644,40 @@ public: } }; +/// Equality comparison for DenseMap. +/// +/// Iterates over elements of LHS confirming that each (key, value) pair in LHS +/// is also in RHS, and that no additional pairs are in RHS. +/// Equivalent to N calls to RHS.find and N value comparisons. Amortized +/// complexity is linear, worst case is O(N^2) (if every hash collides). +template +bool operator==( + const DenseMapBase &LHS, + const DenseMapBase &RHS) { + if (LHS.size() != RHS.size()) + return false; + + for (auto &KV : LHS) { + auto I = RHS.find(KV.first); + if (I == RHS.end() || I->second != KV.second) + return false; + } + + return true; +} + +/// Inequality comparison for DenseMap. +/// +/// Equivalent to !(LHS == RHS). See operator== for performance notes. 
+template +bool operator!=( + const DenseMapBase &LHS, + const DenseMapBase &RHS) { + return !(LHS == RHS); +} + template , typename BucketT = llvm::detail::DenseMapPair> @@ -677,6 +715,11 @@ public: this->insert(I, E); } + DenseMap(std::initializer_list Vals) { + init(Vals.size()); + this->insert(Vals.begin(), Vals.end()); + } + ~DenseMap() { this->destroyAll(); operator delete(Buckets); diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h index 52fe4adb5bd..404b2f74766 100644 --- a/include/llvm/ADT/DenseSet.h +++ b/include/llvm/ADT/DenseSet.h @@ -214,6 +214,34 @@ public: } }; +/// Equality comparison for DenseSet. +/// +/// Iterates over elements of LHS confirming that each element is also a member +/// of RHS, and that RHS contains no additional values. +/// Equivalent to N calls to RHS.count. Amortized complexity is linear, worst +/// case is O(N^2) (if every hash collides). +template +bool operator==(const DenseSetImpl &LHS, + const DenseSetImpl &RHS) { + if (LHS.size() != RHS.size()) + return false; + + for (auto &E : LHS) + if (!RHS.count(E)) + return false; + + return true; +} + +/// Inequality comparison for DenseSet. +/// +/// Equivalent to !(LHS == RHS). See operator== for performance notes. +template +bool operator!=(const DenseSetImpl &LHS, + const DenseSetImpl &RHS) { + return !(LHS == RHS); +} + } // end namespace detail /// Implements a dense probed hash-table based set. diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp index 87f22f6f403..ee9c5dd3800 100644 --- a/unittests/ADT/DenseMapTest.cpp +++ b/unittests/ADT/DenseMapTest.cpp @@ -362,6 +362,26 @@ int CountCopyAndMove::Move = 0; } // anonymous namespace +// Test initializer list construction. +TEST(DenseMapCustomTest, InitializerList) { + DenseMap M({{0, 0}, {0, 1}, {1, 2}}); + EXPECT_EQ(2u, M.size()); + EXPECT_EQ(1u, M.count(0)); + EXPECT_EQ(0, M[0]); + EXPECT_EQ(1u, M.count(1)); + EXPECT_EQ(2, M[1]); +} + +// Test equality comparison.
+TEST(DenseMapCustomTest, EqualityComparison) { + DenseMap M1({{0, 0}, {1, 2}}); + DenseMap M2({{0, 0}, {1, 2}}); + DenseMap M3({{0, 0}, {1, 3}}); + + EXPECT_EQ(M1, M2); + EXPECT_NE(M1, M3); +} + // Test for the default minimum size of a DenseMap TEST(DenseMapCustomTest, DefaultMinReservedSizeTest) { // IF THIS VALUE CHANGE, please update InitialSizeTest, InitFromIterator, and diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp index 0247f023dce..04f84e041fb 100644 --- a/unittests/ADT/DenseSetTest.cpp +++ b/unittests/ADT/DenseSetTest.cpp @@ -121,6 +121,15 @@ TYPED_TEST(DenseSetTest, FindAsTest) { EXPECT_TRUE(set.find_as("d") == set.end()); } +TYPED_TEST(DenseSetTest, EqualityComparisonTest) { + TypeParam set1({1, 2, 3, 4}); + TypeParam set2({4, 3, 2, 1}); + TypeParam set3({2, 3, 4, 5}); + + EXPECT_EQ(set1, set2); + EXPECT_NE(set1, set3); +} + // Simple class that counts how many moves and copy happens when growing a map struct CountCopyAndMove { static int Move; -- GitLab From 8559689cb23c3dd7d21dcdf113748f1cf5fefb85 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 15:28:44 +0000 Subject: [PATCH 0200/1116] [x86] add tests for fma with undef elts; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344523 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fma_patterns.ll | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index 2d01c570f99..d0d0dfed352 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -636,6 +636,29 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) { ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: 
retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %a = fadd <4 x float> %x, + %m = fmul <4 x float> %y, %a + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y: ; FMA-INFS: # %bb.0: @@ -712,6 +735,29 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %a = fadd <4 x float> %x, + %m = fmul <4 x float> %y, %a + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -- GitLab From dea3b338ac8a25004094b5adffe2b0a33f7142fb Mon Sep 17 00:00:00 2001 From: Fedor Sergeev Date: Mon, 15 Oct 2018 15:36:08 +0000 Subject: [PATCH 0201/1116] Revert "[NewPM] teach -passes= to emit meaningful error messages" This reverts r344519 due to failures in pipeline-parsing test. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344524 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Passes/PassBuilder.h | 51 ++- lib/LTO/LTOBackend.cpp | 14 +- lib/Passes/PassBuilder.cpp | 366 +++++++++------------ test/Other/pass-pipeline-parsing.ll | 83 +---- test/tools/llvm-lto2/X86/pipeline.ll | 4 +- test/tools/llvm-opt-fuzzer/command-line.ll | 2 +- tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp | 11 +- tools/opt/NewPMDriver.cpp | 84 ++--- unittests/IR/CMakeLists.txt | 2 - unittests/IR/PassBuilderCallbacksTest.cpp | 37 +-- unittests/Passes/CMakeLists.txt | 1 - unittests/Passes/PluginsTest.cpp | 5 +- 12 files changed, 266 insertions(+), 394 deletions(-) diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index 22e5eb0caa0..91314430a96 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -19,7 +19,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/IR/PassManager.h" -#include "llvm/Support/Error.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include @@ -385,9 +384,8 @@ public: /// If the sequence of passes aren't all the exact same kind of pass, it will /// be an error. You cannot mix different levels implicitly, you must /// explicitly form a pass manager in which to nest passes. 
- Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText, - bool VerifyEachPass = true, - bool DebugLogging = false); + bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText, + bool VerifyEachPass = true, bool DebugLogging = false); /// {{@ Parse a textual pass pipeline description into a specific PassManager /// @@ -396,15 +394,12 @@ public: /// this is the valid pipeline text: /// /// function(lpass) - Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText, - bool VerifyEachPass = true, - bool DebugLogging = false); - Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText, - bool VerifyEachPass = true, - bool DebugLogging = false); - Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText, - bool VerifyEachPass = true, - bool DebugLogging = false); + bool parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText, + bool VerifyEachPass = true, bool DebugLogging = false); + bool parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText, + bool VerifyEachPass = true, bool DebugLogging = false); + bool parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText, + bool VerifyEachPass = true, bool DebugLogging = false); /// @}} /// Parse a textual alias analysis pipeline into the provided AA manager. @@ -422,7 +417,7 @@ public: /// Returns false if the text cannot be parsed cleanly. The specific state of /// the \p AA manager is unspecified if such an error is encountered and this /// returns false. 
- Error parseAAPipeline(AAManager &AA, StringRef PipelineText); + bool parseAAPipeline(AAManager &AA, StringRef PipelineText); /// Register a callback for a default optimizer pipeline extension /// point @@ -570,28 +565,28 @@ private: static Optional> parsePipelineText(StringRef Text); - Error parseModulePass(ModulePassManager &MPM, const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging); - Error parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E, + bool parseModulePass(ModulePassManager &MPM, const PipelineElement &E, bool VerifyEachPass, bool DebugLogging); - Error parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging); - Error parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, + bool parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E, bool VerifyEachPass, bool DebugLogging); + bool parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging); + bool parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging); bool parseAAPassName(AAManager &AA, StringRef Name); - Error parseLoopPassPipeline(LoopPassManager &LPM, + bool parseLoopPassPipeline(LoopPassManager &LPM, + ArrayRef Pipeline, + bool VerifyEachPass, bool DebugLogging); + bool parseFunctionPassPipeline(FunctionPassManager &FPM, + ArrayRef Pipeline, + bool VerifyEachPass, bool DebugLogging); + bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM, ArrayRef Pipeline, bool VerifyEachPass, bool DebugLogging); - Error parseFunctionPassPipeline(FunctionPassManager &FPM, - ArrayRef Pipeline, - bool VerifyEachPass, bool DebugLogging); - Error parseCGSCCPassPipeline(CGSCCPassManager &CGPM, + bool parseModulePassPipeline(ModulePassManager &MPM, ArrayRef Pipeline, bool VerifyEachPass, bool DebugLogging); - Error parseModulePassPipeline(ModulePassManager &MPM, - ArrayRef Pipeline, - bool VerifyEachPass, bool DebugLogging); 
void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, OptimizationLevel Level, bool RunProfileGen, diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 1f9d60a5bdf..20fc40de4b9 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -162,7 +162,7 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM, AAManager AA; // Parse a custom AA pipeline if asked to. - if (auto Err = PB.parseAAPipeline(AA, "default")) + if (!PB.parseAAPipeline(AA, "default")) report_fatal_error("Error parsing default AA pipeline"); LoopAnalysisManager LAM(Conf.DebugPassManager); @@ -221,9 +221,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM, // Parse a custom AA pipeline if asked to. if (!AAPipelineDesc.empty()) - if (auto Err = PB.parseAAPipeline(AA, AAPipelineDesc)) - report_fatal_error("unable to parse AA pipeline description '" + - AAPipelineDesc + "': " + toString(std::move(Err))); + if (!PB.parseAAPipeline(AA, AAPipelineDesc)) + report_fatal_error("unable to parse AA pipeline description: " + + AAPipelineDesc); LoopAnalysisManager LAM; FunctionAnalysisManager FAM; @@ -246,9 +246,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM, MPM.addPass(VerifierPass()); // Now, add all the passes we've been requested to. 
- if (auto Err = PB.parsePassPipeline(MPM, PipelineDesc)) - report_fatal_error("unable to parse pass pipeline description '" + - PipelineDesc + "': " + toString(std::move(Err))); + if (!PB.parsePassPipeline(MPM, PipelineDesc)) + report_fatal_error("unable to parse pass pipeline description: " + + PipelineDesc); if (!DisableVerify) MPM.addPass(VerifierPass()); diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index f6313d23e2d..09758dc5651 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -58,7 +58,6 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Regex.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" @@ -1403,9 +1402,9 @@ PassBuilder::parsePipelineText(StringRef Text) { return {std::move(ResultPipeline)}; } -Error PassBuilder::parseModulePass(ModulePassManager &MPM, - const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parseModulePass(ModulePassManager &MPM, + const PipelineElement &E, bool VerifyEachPass, + bool DebugLogging) { auto &Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1413,56 +1412,50 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, if (!InnerPipeline.empty()) { if (Name == "module") { ModulePassManager NestedMPM(DebugLogging); - if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; MPM.addPass(std::move(NestedMPM)); - return Error::success(); + return true; } if (Name == "cgscc") { CGSCCPassManager CGPM(DebugLogging); - if (auto Err = parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return Err; + if (!parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + 
return false; MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); - return Error::success(); + return true; } if (Name == "function") { FunctionPassManager FPM(DebugLogging); - if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); - return Error::success(); + return true; } if (auto Count = parseRepeatPassName(Name)) { ModulePassManager NestedMPM(DebugLogging); - if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM))); - return Error::success(); + return true; } for (auto &C : ModulePipelineParsingCallbacks) if (C(Name, MPM, InnerPipeline)) - return Error::success(); + return true; // Normal passes can't have pipelines. - return make_error( - formatv("invalid use of '{0}' pass as module pipeline", Name).str(), - inconvertibleErrorCode()); - ; + return false; } // Manually handle aliases for pre-configured pipeline fragments. if (startsWithDefaultPipelineAliasPrefix(Name)) { SmallVector Matches; if (!DefaultAliasRegex.match(Name, &Matches)) - return make_error( - formatv("unknown default pipeline alias '{0}'", Name).str(), - inconvertibleErrorCode()); - + return false; assert(Matches.size() == 3 && "Must capture two matched strings!"); OptimizationLevel L = StringSwitch(Matches[2]) @@ -1474,7 +1467,7 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, .Case("Oz", Oz); if (L == O0) // At O0 we do nothing at all! 
- return Error::success(); + return true; if (Matches[1] == "default") { MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging)); @@ -1488,40 +1481,38 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, assert(Matches[1] == "lto" && "Not one of the matched options!"); MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr)); } - return Error::success(); + return true; } // Finally expand the basic registered passes from the .inc file. #define MODULE_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ MPM.addPass(CREATE_PASS); \ - return Error::success(); \ + return true; \ } #define MODULE_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ MPM.addPass( \ RequireAnalysisPass< \ std::remove_reference::type, Module>()); \ - return Error::success(); \ + return true; \ } \ if (Name == "invalidate<" NAME ">") { \ MPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return Error::success(); \ + return true; \ } #include "PassRegistry.def" for (auto &C : ModulePipelineParsingCallbacks) if (C(Name, MPM, InnerPipeline)) - return Error::success(); - return make_error( - formatv("unknown module pass '{0}'", Name).str(), - inconvertibleErrorCode()); + return true; + return false; } -Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, - const PipelineElement &E, bool VerifyEachPass, - bool DebugLogging) { +bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, + const PipelineElement &E, bool VerifyEachPass, + bool DebugLogging) { auto &Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1529,55 +1520,53 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, if (!InnerPipeline.empty()) { if (Name == "cgscc") { CGSCCPassManager NestedCGPM(DebugLogging); - if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; // Add the nested pass manager with 
the appropriate adaptor. CGPM.addPass(std::move(NestedCGPM)); - return Error::success(); + return true; } if (Name == "function") { FunctionPassManager FPM(DebugLogging); - if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; // Add the nested pass manager with the appropriate adaptor. CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); - return Error::success(); + return true; } if (auto Count = parseRepeatPassName(Name)) { CGSCCPassManager NestedCGPM(DebugLogging); - if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM))); - return Error::success(); + return true; } if (auto MaxRepetitions = parseDevirtPassName(Name)) { CGSCCPassManager NestedCGPM(DebugLogging); - if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; CGPM.addPass( createDevirtSCCRepeatedPass(std::move(NestedCGPM), *MaxRepetitions)); - return Error::success(); + return true; } for (auto &C : CGSCCPipelineParsingCallbacks) if (C(Name, CGPM, InnerPipeline)) - return Error::success(); + return true; // Normal passes can't have pipelines. - return make_error( - formatv("invalid use of '{0}' pass as cgscc pipeline", Name).str(), - inconvertibleErrorCode()); + return false; } // Now expand the basic registered passes from the .inc file. 
#define CGSCC_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ CGPM.addPass(CREATE_PASS); \ - return Error::success(); \ + return true; \ } #define CGSCC_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ @@ -1585,26 +1574,24 @@ Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM, std::remove_reference::type, \ LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &, \ CGSCCUpdateResult &>()); \ - return Error::success(); \ + return true; \ } \ if (Name == "invalidate<" NAME ">") { \ CGPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return Error::success(); \ + return true; \ } #include "PassRegistry.def" for (auto &C : CGSCCPipelineParsingCallbacks) if (C(Name, CGPM, InnerPipeline)) - return Error::success(); - return make_error( - formatv("unknown cgscc pass '{0}'", Name).str(), - inconvertibleErrorCode()); + return true; + return false; } -Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, - const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM, + const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging) { auto &Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1612,72 +1599,68 @@ Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM, if (!InnerPipeline.empty()) { if (Name == "function") { FunctionPassManager NestedFPM(DebugLogging); - if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; // Add the nested pass manager with the appropriate adaptor. 
FPM.addPass(std::move(NestedFPM)); - return Error::success(); + return true; } if (Name == "loop") { LoopPassManager LPM(DebugLogging); - if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass, - DebugLogging)) - return Err; + if (!parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; // Add the nested pass manager with the appropriate adaptor. FPM.addPass( createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging)); - return Error::success(); + return true; } if (auto Count = parseRepeatPassName(Name)) { FunctionPassManager NestedFPM(DebugLogging); - if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM))); - return Error::success(); + return true; } for (auto &C : FunctionPipelineParsingCallbacks) if (C(Name, FPM, InnerPipeline)) - return Error::success(); + return true; // Normal passes can't have pipelines. - return make_error( - formatv("invalid use of '{0}' pass as function pipeline", Name).str(), - inconvertibleErrorCode()); + return false; } // Now expand the basic registered passes from the .inc file. 
#define FUNCTION_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ FPM.addPass(CREATE_PASS); \ - return Error::success(); \ + return true; \ } #define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ FPM.addPass( \ RequireAnalysisPass< \ std::remove_reference::type, Function>()); \ - return Error::success(); \ + return true; \ } \ if (Name == "invalidate<" NAME ">") { \ FPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return Error::success(); \ + return true; \ } #include "PassRegistry.def" for (auto &C : FunctionPipelineParsingCallbacks) if (C(Name, FPM, InnerPipeline)) - return Error::success(); - return make_error( - formatv("unknown function pass '{0}'", Name).str(), - inconvertibleErrorCode()); + return true; + return false; } -Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, + bool VerifyEachPass, bool DebugLogging) { StringRef Name = E.Name; auto &InnerPipeline = E.InnerPipeline; @@ -1685,37 +1668,35 @@ Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, if (!InnerPipeline.empty()) { if (Name == "loop") { LoopPassManager NestedLPM(DebugLogging); - if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; // Add the nested pass manager with the appropriate adaptor. 
LPM.addPass(std::move(NestedLPM)); - return Error::success(); + return true; } if (auto Count = parseRepeatPassName(Name)) { LoopPassManager NestedLPM(DebugLogging); - if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline, - VerifyEachPass, DebugLogging)) - return Err; + if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass, + DebugLogging)) + return false; LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM))); - return Error::success(); + return true; } for (auto &C : LoopPipelineParsingCallbacks) if (C(Name, LPM, InnerPipeline)) - return Error::success(); + return true; // Normal passes can't have pipelines. - return make_error( - formatv("invalid use of '{0}' pass as loop pipeline", Name).str(), - inconvertibleErrorCode()); + return false; } // Now expand the basic registered passes from the .inc file. #define LOOP_PASS(NAME, CREATE_PASS) \ if (Name == NAME) { \ LPM.addPass(CREATE_PASS); \ - return Error::success(); \ + return true; \ } #define LOOP_ANALYSIS(NAME, CREATE_PASS) \ if (Name == "require<" NAME ">") { \ @@ -1723,20 +1704,19 @@ Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E, std::remove_reference::type, Loop, \ LoopAnalysisManager, LoopStandardAnalysisResults &, \ LPMUpdater &>()); \ - return Error::success(); \ + return true; \ } \ if (Name == "invalidate<" NAME ">") { \ LPM.addPass(InvalidateAnalysisPass< \ std::remove_reference::type>()); \ - return Error::success(); \ + return true; \ } #include "PassRegistry.def" for (auto &C : LoopPipelineParsingCallbacks) if (C(Name, LPM, InnerPipeline)) - return Error::success(); - return make_error(formatv("unknown loop pass '{0}'", Name).str(), - inconvertibleErrorCode()); + return true; + return false; } bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) { @@ -1760,42 +1740,41 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) { return false; } -Error PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM, - 
ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (auto Err = parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging)) - return Err; + if (!parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging)) + return false; // FIXME: No verifier support for Loop passes! } - return Error::success(); + return true; } -Error PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM, - ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (auto Err = - parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging)) - return Err; + if (!parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging)) + return false; if (VerifyEachPass) FPM.addPass(VerifierPass()); } - return Error::success(); + return true; } -Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, - ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (auto Err = parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging)) - return Err; + if (!parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging)) + return false; // FIXME: No verifier support for CGSCC passes! 
} - return Error::success(); + return true; } void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, @@ -1811,30 +1790,28 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM, LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); } -Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, - ArrayRef Pipeline, - bool VerifyEachPass, - bool DebugLogging) { +bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, + ArrayRef Pipeline, + bool VerifyEachPass, + bool DebugLogging) { for (const auto &Element : Pipeline) { - if (auto Err = parseModulePass(MPM, Element, VerifyEachPass, DebugLogging)) - return Err; + if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging)) + return false; if (VerifyEachPass) MPM.addPass(VerifierPass()); } - return Error::success(); + return true; } // Primary pass pipeline description parsing routine for a \c ModulePassManager // FIXME: Should this routine accept a TargetMachine or require the caller to // pre-populate the analysis managers with target-specific stuff? -Error PassBuilder::parsePassPipeline(ModulePassManager &MPM, - StringRef PipelineText, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parsePassPipeline(ModulePassManager &MPM, + StringRef PipelineText, bool VerifyEachPass, + bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return make_error( - formatv("invalid pipeline '{0}'", PipelineText).str(), - inconvertibleErrorCode()); + return false; // If the first name isn't at the module layer, wrap the pipeline up // automatically. @@ -1851,106 +1828,73 @@ Error PassBuilder::parsePassPipeline(ModulePassManager &MPM, } else { for (auto &C : TopLevelPipelineParsingCallbacks) if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging)) - return Error::success(); - - // Unknown pass or pipeline name! 
- auto &InnerPipeline = Pipeline->front().InnerPipeline; - return make_error( - formatv("unknown {0} name '{1}'", - (InnerPipeline.empty() ? "pass" : "pipeline"), FirstName) - .str(), - inconvertibleErrorCode()); + return true; + + // Unknown pass name! + return false; } } - if (auto Err = - parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging)) - return Err; - return Error::success(); + return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging); } // Primary pass pipeline description parsing routine for a \c CGSCCPassManager -Error PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM, - StringRef PipelineText, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM, + StringRef PipelineText, bool VerifyEachPass, + bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return make_error( - formatv("invalid pipeline '{0}'", PipelineText).str(), - inconvertibleErrorCode()); + return false; StringRef FirstName = Pipeline->front().Name; if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks)) - return make_error( - formatv("unknown cgscc pass '{0}' in pipeline '{1}'", FirstName, - PipelineText) - .str(), - inconvertibleErrorCode()); - - if (auto Err = - parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging)) - return Err; - return Error::success(); + return false; + + return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging); } // Primary pass pipeline description parsing routine for a \c // FunctionPassManager -Error PassBuilder::parsePassPipeline(FunctionPassManager &FPM, - StringRef PipelineText, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM, + StringRef PipelineText, bool VerifyEachPass, + bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return make_error( - formatv("invalid 
pipeline '{0}'", PipelineText).str(), - inconvertibleErrorCode()); + return false; StringRef FirstName = Pipeline->front().Name; if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks)) - return make_error( - formatv("unknown function pass '{0}' in pipeline '{1}'", FirstName, - PipelineText) - .str(), - inconvertibleErrorCode()); - - if (auto Err = parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass, - DebugLogging)) - return Err; - return Error::success(); + return false; + + return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass, + DebugLogging); } // Primary pass pipeline description parsing routine for a \c LoopPassManager -Error PassBuilder::parsePassPipeline(LoopPassManager &CGPM, - StringRef PipelineText, - bool VerifyEachPass, bool DebugLogging) { +bool PassBuilder::parsePassPipeline(LoopPassManager &CGPM, + StringRef PipelineText, bool VerifyEachPass, + bool DebugLogging) { auto Pipeline = parsePipelineText(PipelineText); if (!Pipeline || Pipeline->empty()) - return make_error( - formatv("invalid pipeline '{0}'", PipelineText).str(), - inconvertibleErrorCode()); - - if (auto Err = - parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging)) - return Err; + return false; - return Error::success(); + return parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging); } -Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) { +bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) { // If the pipeline just consists of the word 'default' just replace the AA // manager with our default one. 
if (PipelineText == "default") { AA = buildDefaultAAPipeline(); - return Error::success(); + return true; } while (!PipelineText.empty()) { StringRef Name; std::tie(Name, PipelineText) = PipelineText.split(','); if (!parseAAPassName(AA, Name)) - return make_error( - formatv("unknown alias analysis name '{0}'", Name).str(), - inconvertibleErrorCode()); + return false; } - return Error::success(); + return true; } diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll index d13a977dbce..b303318c796 100644 --- a/test/Other/pass-pipeline-parsing.ll +++ b/test/Other/pass-pipeline-parsing.ll @@ -54,52 +54,52 @@ ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-module)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED1 -; CHECK-UNBALANCED1: invalid pipeline 'no-op-module)' +; CHECK-UNBALANCED1: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='module(no-op-module))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED2 -; CHECK-UNBALANCED2: invalid pipeline 'module(no-op-module))' +; CHECK-UNBALANCED2: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='module(no-op-module' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED3 -; CHECK-UNBALANCED3: invalid pipeline 'module(no-op-module' +; CHECK-UNBALANCED3: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED4 -; CHECK-UNBALANCED4: invalid pipeline 'no-op-function)' +; CHECK-UNBALANCED4: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(no-op-function))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED5 -; CHECK-UNBALANCED5: invalid pipeline 'function(no-op-function))' +; CHECK-UNBALANCED5: 
unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(function(no-op-function)))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED6 -; CHECK-UNBALANCED6: invalid pipeline 'function(function(no-op-function)))' +; CHECK-UNBALANCED6: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(no-op-function' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED7 -; CHECK-UNBALANCED7: invalid pipeline 'function(no-op-function' +; CHECK-UNBALANCED7: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(function(no-op-function)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED8 -; CHECK-UNBALANCED8: invalid pipeline 'function(function(no-op-function)' +; CHECK-UNBALANCED8: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-module,)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED9 -; CHECK-UNBALANCED9: invalid pipeline 'no-op-module,)' +; CHECK-UNBALANCED9: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function,)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-UNBALANCED10 -; CHECK-UNBALANCED10: invalid pipeline 'no-op-function,)' +; CHECK-UNBALANCED10: unable to parse pass pipeline description ; RUN: opt -disable-output -debug-pass-manager \ ; RUN: -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \ @@ -176,86 +176,37 @@ ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='function(no-op-function)function(no-op-function)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-MISSING-COMMA1 -; CHECK-MISSING-COMMA1: invalid pipeline 'function(no-op-function)function(no-op-function)' +; CHECK-MISSING-COMMA1: unable to parse pass pipeline description ; RUN: not opt -disable-output 
-debug-pass-manager \ ; RUN: -passes='function()' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-EMPTY-INNER-PIPELINE -; CHECK-EMPTY-INNER-PIPELINE: unknown function pass '' +; CHECK-EMPTY-INNER-PIPELINE: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-module(no-op-module,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-MODULE-PASS -; CHECK-PIPELINE-ON-MODULE-PASS: invalid use of 'no-op-module' pass as module pipeline +; CHECK-PIPELINE-ON-MODULE-PASS: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-cgscc(no-op-cgscc,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-CGSCC-PASS -; CHECK-PIPELINE-ON-CGSCC-PASS: invalid use of 'no-op-cgscc' pass as cgscc pipeline +; CHECK-PIPELINE-ON-CGSCC-PASS: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function(no-op-function,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-FUNCTION-PASS -; CHECK-PIPELINE-ON-FUNCTION-PASS: invalid use of 'no-op-function' pass as function pipeline +; CHECK-PIPELINE-ON-FUNCTION-PASS: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-loop(no-op-loop,whatever)' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-LOOP-PASS -; CHECK-PIPELINE-ON-LOOP-PASS: invalid use of 'no-op-loop' pass as loop pipeline +; CHECK-PIPELINE-ON-LOOP-PASS: unable to parse pass pipeline description ; RUN: not opt -disable-output -debug-pass-manager \ ; RUN: -passes='no-op-function()' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-EMPTY-PIPELINE-ON-PASS -; CHECK-EMPTY-PIPELINE-ON-PASS: invalid use of 'no-op-function' pass as function pipeline - -; RUN: not opt -passes='no-op-module,bad' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s 
-check-prefix=CHECK-UNKNOWN-MODULE -; CHECK-UNKNOWN-MODULE: opt: unknown module pass 'bad' - -; RUN: not opt -passes='no-op-loop,bad' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP -; CHECK-UNKNOWN-LOOP: opt: unknown loop pass 'bad' - -; RUN: not opt -passes='no-op-cgscc,bad' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC -; CHECK-UNKNOWN-CGSCC: opt: unknown cgscc pass 'bad' - -; RUN: not opt -passes='no-op-function,bad' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION -; RUN: not opt -passes='function(bad,pipeline,text)' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION -; RUN: not opt -passes='module(no-op-module,function(bad,pipeline,text))' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION -; RUN: not opt -passes='no-op-module,function(bad,pipeline,text)' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION -; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION -; CHECK-UNKNOWN-FUNCTION: opt: unknown function pass 'bad' - -; RUN: not opt -aa-pipeline=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR -; AA-PIPELINE-ERR: unknown alias analysis name 'bad' -; RUN: opt -passes-ep-peephole=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PEEPHOLE-ERR -; PASSES-EP-PEEPHOLE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. -; RUN: opt -passes-ep-late-loop-optimizations=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LATELOOPOPT-ERR -; PASSES-EP-LATELOOPOPT-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. 
-; RUN: opt -passes-ep-loop-optimizer-end=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LOOPOPTEND-ERR -; PASSES-EP-LOOPOPTEND-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. -; RUN: opt -passes-ep-scalar-optimizer-late=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-SCALAROPTLATE-ERR -; PASSES-EP-SCALAROPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. -; RUN: opt -passes-ep-cgscc-optimizer-late=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-CGSCCOPTLATE-ERR -; PASSES-EP-CGSCCOPTLATE-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. -; RUN: opt -passes-ep-vectorizer-start=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-VECTORIZERSTART-ERR -; PASSES-EP-VECTORIZERSTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. -; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \ -; RUN: /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR -; PASSES-EP-PIPELINESTART-ERR: Could not parse pipeline 'bad'. I'm going to ignore it. 
+; CHECK-EMPTY-PIPELINE-ON-PASS: unable to parse pass pipeline description define void @f() { entry: diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll index 9ab81ac70a7..29276d8d13a 100644 --- a/test/tools/llvm-lto2/X86/pipeline.ll +++ b/test/tools/llvm-lto2/X86/pipeline.ll @@ -32,11 +32,11 @@ define void @patatino() { ; RUN: -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \ ; RUN: FileCheck %s --check-prefix=ERR -; ERR: LLVM ERROR: unable to parse pass pipeline description 'foogoo': unknown pass name 'foogoo' +; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo ; RUN: not llvm-lto2 run %t1.bc -o %t.o \ ; RUN: -r %t1.bc,patatino,px -aa-pipeline patatino \ ; RUN: -opt-pipeline loweratomic 2>&1 | \ ; RUN: FileCheck %s --check-prefix=AAERR -; AAERR: LLVM ERROR: unable to parse AA pipeline description 'patatino': unknown alias analysis name 'patatino' +; AAERR: LLVM ERROR: unable to parse AA pipeline description: patatino diff --git a/test/tools/llvm-opt-fuzzer/command-line.ll b/test/tools/llvm-opt-fuzzer/command-line.ll index 8c3f6b60154..f747bba431b 100644 --- a/test/tools/llvm-opt-fuzzer/command-line.ll +++ b/test/tools/llvm-opt-fuzzer/command-line.ll @@ -13,7 +13,7 @@ ; Don't start with incorrect passes specified ; RUN: not llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes no-pass 2>&1 | FileCheck -check-prefix=PIPELINE %s -; PIPELINE: unknown pass name 'no-pass' +; PIPELINE: can't parse pass pipeline ; Correct command line ; RUN: llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine 2>&1 | FileCheck -check-prefix=CORRECT %s diff --git a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp index 57e75b1db9e..98d5428ddd1 100644 --- a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp +++ b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp @@ -144,10 +144,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { 
PB.registerLoopAnalyses(LAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false); - assert(!Err && "Should have been checked during fuzzer initialization"); - // Only fail with assert above, otherwise ignore the parsing error. - consumeError(std::move(Err)); + bool Ok = PB.parsePassPipeline(MPM, PassPipeline, false, false); + assert(Ok && "Should have been checked during fuzzer initialization"); + (void)Ok; // silence unused variable warning on release builds // Run passes which we need to test // @@ -236,8 +235,8 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize( PassBuilder PB(TM.get()); ModulePassManager MPM; - if (auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false)) { - errs() << *argv[0] << ": " << toString(std::move(Err)) << "\n"; + if (!PB.parsePassPipeline(MPM, PassPipeline, false, false)) { + errs() << *argv[0] << ": can't parse pass pipeline\n"; exit(1); } diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp index 11879d26a6c..e63547a79d0 100644 --- a/tools/opt/NewPMDriver.cpp +++ b/tools/opt/NewPMDriver.cpp @@ -124,12 +124,12 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) { // Verify the pipeline is parseable: PassManagerT PM; - if (auto Err = PB.parsePassPipeline(PM, PipelineText)) { - errs() << "Could not parse pipeline '" << PipelineText - << "'. I'm going to ignore it.\n"; - return false; - } - return true; + if (PB.parsePassPipeline(PM, PipelineText)) + return true; + + errs() << "Could not parse pipeline '" << PipelineText + << "'. 
I'm going to igore it.\n"; + return false; } /// If one of the EPPipeline command line options was given, register callbacks @@ -137,61 +137,50 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) { static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass, bool DebugLogging) { if (tryParsePipelineText(PB, PeepholeEPPipeline)) - PB.registerPeepholeEPCallback( - [&PB, VerifyEachPass, DebugLogging]( - FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { - ExitOnError Err("Unable to parse PeepholeEP pipeline: "); - Err(PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass, - DebugLogging)); - }); + PB.registerPeepholeEPCallback([&PB, VerifyEachPass, DebugLogging]( + FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { + PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass, + DebugLogging); + }); if (tryParsePipelineText(PB, LateLoopOptimizationsEPPipeline)) PB.registerLateLoopOptimizationsEPCallback( [&PB, VerifyEachPass, DebugLogging]( LoopPassManager &PM, PassBuilder::OptimizationLevel Level) { - ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: "); - Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline, - VerifyEachPass, DebugLogging)); + PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline, + VerifyEachPass, DebugLogging); }); if (tryParsePipelineText(PB, LoopOptimizerEndEPPipeline)) - PB.registerLoopOptimizerEndEPCallback( - [&PB, VerifyEachPass, DebugLogging]( - LoopPassManager &PM, PassBuilder::OptimizationLevel Level) { - ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: "); - Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, - VerifyEachPass, DebugLogging)); - }); + PB.registerLoopOptimizerEndEPCallback([&PB, VerifyEachPass, DebugLogging]( + LoopPassManager &PM, PassBuilder::OptimizationLevel Level) { + PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, VerifyEachPass, + DebugLogging); + }); if (tryParsePipelineText(PB, 
ScalarOptimizerLateEPPipeline)) PB.registerScalarOptimizerLateEPCallback( [&PB, VerifyEachPass, DebugLogging]( FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { - ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: "); - Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline, - VerifyEachPass, DebugLogging)); + PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline, + VerifyEachPass, DebugLogging); }); if (tryParsePipelineText(PB, CGSCCOptimizerLateEPPipeline)) - PB.registerCGSCCOptimizerLateEPCallback( - [&PB, VerifyEachPass, DebugLogging]( - CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) { - ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: "); - Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, - VerifyEachPass, DebugLogging)); - }); + PB.registerCGSCCOptimizerLateEPCallback([&PB, VerifyEachPass, DebugLogging]( + CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) { + PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, VerifyEachPass, + DebugLogging); + }); if (tryParsePipelineText(PB, VectorizerStartEPPipeline)) - PB.registerVectorizerStartEPCallback( - [&PB, VerifyEachPass, DebugLogging]( - FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { - ExitOnError Err("Unable to parse VectorizerStartEP pipeline: "); - Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline, - VerifyEachPass, DebugLogging)); - }); + PB.registerVectorizerStartEPCallback([&PB, VerifyEachPass, DebugLogging]( + FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) { + PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass, + DebugLogging); + }); if (tryParsePipelineText(PB, PipelineStartEPPipeline)) PB.registerPipelineStartEPCallback( [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) { - ExitOnError Err("Unable to parse PipelineStartEP pipeline: "); - Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass, - DebugLogging)); + PB.parsePassPipeline(PM, 
PipelineStartEPPipeline, VerifyEachPass, + DebugLogging); }); } @@ -269,8 +258,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, // Specially handle the alias analysis manager so that we can register // a custom pipeline of AA passes with it. AAManager AA; - if (auto Err = PB.parseAAPipeline(AA, AAPipeline)) { - errs() << Arg0 << ": " << toString(std::move(Err)) << "\n"; + if (!PB.parseAAPipeline(AA, AAPipeline)) { + errs() << Arg0 << ": unable to parse AA pipeline description.\n"; return false; } @@ -295,9 +284,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, if (EnableDebugify) MPM.addPass(NewPMDebugifyPass()); - if (auto Err = - PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) { - errs() << Arg0 << ": " << toString(std::move(Err)) << "\n"; + if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) { + errs() << Arg0 << ": unable to parse pass pipeline description.\n"; return false; } diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt index 7498983b260..211ab109131 100644 --- a/unittests/IR/CMakeLists.txt +++ b/unittests/IR/CMakeLists.txt @@ -40,5 +40,3 @@ add_llvm_unittest(IRTests VerifierTest.cpp WaymarkTest.cpp ) - -target_link_libraries(IRTests PRIVATE LLVMTestingSupport) diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp index 20c47b045e7..97bbb81a6b0 100644 --- a/unittests/IR/PassBuilderCallbacksTest.cpp +++ b/unittests/IR/PassBuilderCallbacksTest.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Testing/Support/Error.h" #include #include #include @@ -461,7 +460,7 @@ TEST_F(ModuleCallbacksTest, Passes) { .WillOnce(Invoke(getAnalysisResult)); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline 
was: " << PipelineText; PM.run(*M, AM); @@ -495,7 +494,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); @@ -526,7 +525,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); @@ -538,7 +537,7 @@ TEST_F(FunctionCallbacksTest, Passes) { .WillOnce(Invoke(getAnalysisResult)); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -572,7 +571,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -605,7 +604,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -616,7 +615,7 @@ TEST_F(LoopCallbacksTest, Passes) { .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult))); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline 
was: " << PipelineText; PM.run(*M, AM); } @@ -651,7 +650,7 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -683,7 +682,7 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -694,7 +693,7 @@ TEST_F(CGSCCCallbacksTest, Passes) { .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult))); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -728,7 +727,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) { .InSequence(PISequence); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -760,7 +759,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) { .Times(0); StringRef PipelineText = "test-transform"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -775,7 +774,7 @@ TEST_F(ModuleCallbacksTest, AnalysisUtilities) { EXPECT_CALL(AnalysisHandle, invalidate(HasName(""), _, _)); StringRef PipelineText = "require,invalidate"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, 
PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -785,7 +784,7 @@ TEST_F(CGSCCCallbacksTest, PassUtilities) { EXPECT_CALL(AnalysisHandle, invalidate(HasName("(foo)"), _, _)); StringRef PipelineText = "require,invalidate"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -795,7 +794,7 @@ TEST_F(FunctionCallbacksTest, AnalysisUtilities) { EXPECT_CALL(AnalysisHandle, invalidate(HasName("foo"), _, _)); StringRef PipelineText = "require,invalidate"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -806,7 +805,7 @@ TEST_F(LoopCallbacksTest, PassUtilities) { StringRef PipelineText = "require,invalidate"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); } @@ -846,13 +845,13 @@ TEST_F(ModuleCallbacksTest, ParseTopLevelPipeline) { StringRef PipelineText = "another-pipeline(test-transform,invalidate)"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded()) + ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; PM.run(*M, AM); /// Test the negative case PipelineText = "another-pipeline(instcombine)"; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Failed()) + ASSERT_FALSE(PB.parsePassPipeline(PM, PipelineText, true)) << "Pipeline was: " << PipelineText; } } // end anonymous namespace diff --git a/unittests/Passes/CMakeLists.txt b/unittests/Passes/CMakeLists.txt index 415f3a71734..d90df209d4e 100644 --- a/unittests/Passes/CMakeLists.txt +++ b/unittests/Passes/CMakeLists.txt @@ -12,7 +12,6 @@ add_llvm_unittest(PluginsTests 
PluginsTest.cpp ) export_executable_symbols(PluginsTests) -target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport) set(LLVM_LINK_COMPONENTS) add_llvm_loadable_module(TestPlugin diff --git a/unittests/Passes/PluginsTest.cpp b/unittests/Passes/PluginsTest.cpp index abb7b57ee0c..726978714e8 100644 --- a/unittests/Passes/PluginsTest.cpp +++ b/unittests/Passes/PluginsTest.cpp @@ -15,7 +15,6 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Path.h" -#include "llvm/Testing/Support/Error.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "gtest/gtest.h" @@ -55,8 +54,8 @@ TEST(PluginsTests, LoadPlugin) { PassBuilder PB; ModulePassManager PM; - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Failed()); + ASSERT_FALSE(PB.parsePassPipeline(PM, "plugin-pass")); Plugin->registerPassBuilderCallbacks(PB); - ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Succeeded()); + ASSERT_TRUE(PB.parsePassPipeline(PM, "plugin-pass")); } -- GitLab From fb06745cac34b3c3803b41de0e7958e9e9f85dc0 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 15:38:38 +0000 Subject: [PATCH 0202/1116] [DAGCombiner] allow undef elts in vector fma matching git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344525 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 ++--- test/CodeGen/X86/fma_patterns.ll | 90 ++++++++++++++++-------- 2 files changed, 70 insertions(+), 39 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7ec5fac390b..f2779a3475e 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10794,17 +10794,18 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { unsigned PreferredFusedOpcode = HasFMAD ? 
ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); - // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) - // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) + // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y) + // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y)) auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { - auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); - if (XC1 && XC1->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - Y, Flags); - if (XC1 && XC1->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) { + if (C->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + Y, Flags); + if (C->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + } } return SDValue(); }; diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index d0d0dfed352..5395ae46d47 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -637,23 +637,38 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) { } define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; FMA: # %bb.0: -; FMA-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-NEXT: retq +; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA-INFS: # %bb.0: +; FMA-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-INFS-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; FMA4: # %bb.0: -; FMA4-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 
-; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-NEXT: retq +; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA4-INFS: # %bb.0: +; FMA4-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-INFS-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; AVX512-INFS: # %bb.0: +; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-INFS-NEXT: retq +; +; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA-NOINFS: # %bb.0: +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NOINFS-NEXT: retq +; +; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA4-NOINFS: # %bb.0: +; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0 +; FMA4-NOINFS-NEXT: retq +; +; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; AVX512-NOINFS: # %bb.0: +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NOINFS-NEXT: retq %a = fadd <4 x float> %x, %m = fmul <4 x float> %y, %a ret <4 x float> %m @@ -736,23 +751,38 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y } define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; FMA: # %bb.0: -; FMA-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-NEXT: retq +; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA-INFS: # %bb.0: +; FMA-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-INFS-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; FMA4: # %bb.0: -; FMA4-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 -; FMA4-NEXT: vmulps %xmm0, 
%xmm1, %xmm0 -; FMA4-NEXT: retq +; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA4-INFS: # %bb.0: +; FMA4-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-INFS-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; AVX512-INFS: # %bb.0: +; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-INFS-NEXT: retq +; +; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA-NOINFS: # %bb.0: +; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: retq +; +; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA4-NOINFS: # %bb.0: +; FMA4-NOINFS-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0 +; FMA4-NOINFS-NEXT: retq +; +; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; AVX512-NOINFS: # %bb.0: +; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: retq %a = fadd <4 x float> %x, %m = fmul <4 x float> %y, %a ret <4 x float> %m -- GitLab From 0ea7fc0dde41258d1352daa7f22e0438769b1a63 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 15:47:37 +0000 Subject: [PATCH 0203/1116] [x86] add tests for fma with undef elts; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344527 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/fma_patterns.ll | 98 ++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index 5395ae46d47..9ab2b1281f7 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -870,6 +870,32 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { ret <4 x 
float> %m } +define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vmovaps {{.*#+}} xmm2 = <1,u,1,1> +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vmovaps {{.*#+}} xmm2 = <1,u,1,1> +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %s = fsub <4 x float> , %x + %m = fmul <4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: @@ -952,6 +978,32 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1> +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1> +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %s = fsub <4 x float> , %x + %m = fmul <4 x float> %y, %s + ret <4 x float> %m +} + 
define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y: ; FMA-INFS: # %bb.0: @@ -1028,6 +1080,29 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) { ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %s = fsub <4 x float> %x, + %m = fmul <4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y: ; FMA-INFS: # %bb.0: @@ -1104,6 +1179,29 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %s = fsub <4 x float> %x, + %m = fmul <4 x float> %y, %s + ret <4 x float> %m +} 
+ ; ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y)) ; -- GitLab From 907565571c1306f9914da6d7ecb98b781c658b1d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 15:56:39 +0000 Subject: [PATCH 0204/1116] [DAGCombiner] allow undef elts in vector fma matching git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344528 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 43 ++--- test/CodeGen/X86/fma_patterns.ll | 192 +++++++++++++++-------- 2 files changed, 148 insertions(+), 87 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f2779a3475e..846830b3b28 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10815,29 +10815,30 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { if (SDValue FMA = FuseFADD(N1, N0, Flags)) return FMA; - // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) - // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) - // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) - // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) + // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y) + // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y)) + // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y)) + // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y) auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { - auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); - if (XC0 && XC0->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - Y, Flags); - if (XC0 && XC0->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, - DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); - - auto XC1 = 
isConstOrConstSplatFP(X.getOperand(1)); - if (XC1 && XC1->isExactlyValue(+1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); - if (XC1 && XC1->isExactlyValue(-1.0)) - return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, - Y, Flags); + if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) { + if (C0->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + Y, Flags); + if (C0->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, + DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + } + if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) { + if (C1->isExactlyValue(+1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); + if (C1->isExactlyValue(-1.0)) + return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, + Y, Flags); + } } return SDValue(); }; diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index 9ab2b1281f7..038836bd524 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -871,26 +871,41 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { } define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; FMA: # %bb.0: -; FMA-NEXT: vmovaps {{.*#+}} xmm2 = <1,u,1,1> -; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-NEXT: retq +; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA-INFS: # %bb.0: +; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1,u,1,1> +; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-INFS-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; FMA4: # %bb.0: -; 
FMA4-NEXT: vmovaps {{.*#+}} xmm2 = <1,u,1,1> -; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-NEXT: retq +; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA4-INFS: # %bb.0: +; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1,u,1,1> +; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-INFS-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] -; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; AVX512-INFS: # %bb.0: +; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] +; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-INFS-NEXT: retq +; +; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA-NOINFS: # %bb.0: +; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 +; FMA-NOINFS-NEXT: retq +; +; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA4-NOINFS: # %bb.0: +; FMA4-NOINFS-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0 +; FMA4-NOINFS-NEXT: retq +; +; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; AVX512-NOINFS: # %bb.0: +; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 +; AVX512-NOINFS-NEXT: retq %s = fsub <4 x float> , %x %m = fmul <4 x float> %y, %s ret <4 x float> %m @@ -979,26 +994,41 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y } define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; FMA: # %bb.0: -; FMA-NEXT: vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1> -; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-NEXT: retq +; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA-INFS: # %bb.0: 
+; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1> +; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-INFS-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; FMA4: # %bb.0: -; FMA4-NEXT: vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1> -; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-NEXT: retq +; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA4-INFS: # %bb.0: +; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1,-1,u,-1> +; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-INFS-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1] -; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; AVX512-INFS: # %bb.0: +; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1] +; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-INFS-NEXT: retq +; +; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA-NOINFS: # %bb.0: +; FMA-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: retq +; +; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA4-NOINFS: # %bb.0: +; FMA4-NOINFS-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0 +; FMA4-NOINFS-NEXT: retq +; +; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; AVX512-NOINFS: # %bb.0: +; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: retq %s = fsub <4 x float> , %x %m = fmul <4 x float> %y, %s ret <4 x float> %m @@ -1081,23 +1111,38 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) { } define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: 
test_v4f32_mul_y_sub_x_one_undefs: -; FMA: # %bb.0: -; FMA-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-NEXT: retq +; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA-INFS: # %bb.0: +; FMA-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-INFS-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; FMA4: # %bb.0: -; FMA4-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 -; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-NEXT: retq +; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA4-INFS: # %bb.0: +; FMA4-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-INFS-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; AVX512-INFS: # %bb.0: +; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-INFS-NEXT: retq +; +; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA-NOINFS: # %bb.0: +; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NOINFS-NEXT: retq +; +; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA4-NOINFS: # %bb.0: +; FMA4-NOINFS-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0 +; FMA4-NOINFS-NEXT: retq +; +; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; AVX512-NOINFS: # %bb.0: +; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NOINFS-NEXT: retq %s = fsub <4 x float> %x, %m = fmul <4 x float> %y, %s ret <4 x float> %m @@ -1180,23 +1225,38 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y } define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: 
test_v4f32_mul_y_sub_x_negone_undefs: -; FMA: # %bb.0: -; FMA-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 -; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-NEXT: retq +; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA-INFS: # %bb.0: +; FMA-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-INFS-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; FMA4: # %bb.0: -; FMA4-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 -; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-NEXT: retq +; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA4-INFS: # %bb.0: +; FMA4-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0 +; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-INFS-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; AVX512-INFS: # %bb.0: +; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-INFS-NEXT: retq +; +; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA-NOINFS: # %bb.0: +; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NOINFS-NEXT: retq +; +; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA4-NOINFS: # %bb.0: +; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0 +; FMA4-NOINFS-NEXT: retq +; +; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; AVX512-NOINFS: # %bb.0: +; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NOINFS-NEXT: retq %s = fsub <4 x float> %x, %m = fmul <4 x float> %y, %s ret <4 x float> %m -- GitLab From 772e632e25431f43d61d5f20e3cb1cbea69caaee Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 16:44:00 +0000 Subject: [PATCH 0205/1116] [AArch64] add tests for fmul x, -2.0 with undef elts; 
NFC Also, add tests with commuted operands. There was no coverage for that case. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344531 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AArch64/fadd-combines.ll | 55 ++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 10 deletions(-) diff --git a/test/CodeGen/AArch64/fadd-combines.ll b/test/CodeGen/AArch64/fadd-combines.ll index be027a7b558..c2e4430029a 100644 --- a/test/CodeGen/AArch64/fadd-combines.ll +++ b/test/CodeGen/AArch64/fadd-combines.ll @@ -51,8 +51,8 @@ define double @test4(double %a, double %b, double %c) { ret double %add2 } -define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: test5: +define <4 x float> @fmulnegtwo_vec(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmulnegtwo_vec: ; CHECK: // %bb.0: ; CHECK-NEXT: fadd v1.4s, v1.4s, v1.4s ; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s @@ -62,6 +62,41 @@ define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { ret <4 x float> %add } +define <4 x float> @fmulnegtwo_vec_commute(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmulnegtwo_vec_commute: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v1.4s, v1.4s, v1.4s +; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %mul = fmul <4 x float> %b, + %add = fadd <4 x float> %mul, %a + ret <4 x float> %add +} + +define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmulnegtwo_vec_undefs: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.4s, #192, lsl #24 +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %mul = fmul <4 x float> %b, + %add = fadd <4 x float> %a, %mul + ret <4 x float> %add +} + +define <4 x float> @fmulnegtwo_vec_commute_undefs(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: fmulnegtwo_vec_commute_undefs: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v2.4s, #192, lsl #24 +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %mul 
= fmul <4 x float> %b, + %add = fadd <4 x float> %mul, %a + ret <4 x float> %add +} + define <4 x float> @test6(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: test6: ; CHECK: // %bb.0: @@ -99,10 +134,10 @@ define double @test7(double %a, double %b) nounwind { define float @fadd_const_multiuse_fmf(float %x) { ; CHECK-LABEL: fadd_const_multiuse_fmf: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: adrp x9, .LCPI7_1 -; CHECK-NEXT: ldr s1, [x8, :lo12:.LCPI7_0] -; CHECK-NEXT: ldr s2, [x9, :lo12:.LCPI7_1] +; CHECK-NEXT: adrp x8, .LCPI10_0 +; CHECK-NEXT: adrp x9, .LCPI10_1 +; CHECK-NEXT: ldr s1, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: ldr s2, [x9, :lo12:.LCPI10_1] ; CHECK-NEXT: fadd s1, s0, s1 ; CHECK-NEXT: fadd s0, s0, s2 ; CHECK-NEXT: fadd s0, s1, s0 @@ -120,10 +155,10 @@ define float @fadd_const_multiuse_fmf(float %x) { define float @fadd_const_multiuse_attr(float %x) #0 { ; CHECK-LABEL: fadd_const_multiuse_attr: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x9, .LCPI8_1 -; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: ldr s1, [x9, :lo12:.LCPI8_1] -; CHECK-NEXT: ldr s2, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: adrp x9, .LCPI11_1 +; CHECK-NEXT: adrp x8, .LCPI11_0 +; CHECK-NEXT: ldr s1, [x9, :lo12:.LCPI11_1] +; CHECK-NEXT: ldr s2, [x8, :lo12:.LCPI11_0] ; CHECK-NEXT: fadd s1, s0, s1 ; CHECK-NEXT: fadd s1, s2, s1 ; CHECK-NEXT: fadd s0, s0, s1 -- GitLab From 9e0d834cc5697eeefc8b74b2fdf17895a20d4718 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 16:47:01 +0000 Subject: [PATCH 0206/1116] [DAGCombiner] refactor folds for fadd (fmul X, -2.0), Y; NFCI The transform doesn't work if the vector constant has undef elements. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344532 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 34 +++++++++++++----------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 846830b3b28..ab871a25d07 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10851,14 +10851,6 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { return SDValue(); } -static bool isFMulNegTwo(SDValue &N) { - if (N.getOpcode() != ISD::FMUL) - return false; - if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1))) - return CFP->isExactlyValue(-2.0); - return false; -} - SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -10903,14 +10895,24 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return DAG.getNode(ISD::FSUB, DL, VT, N1, GetNegatedExpression(N0, DAG, LegalOperations), Flags); - // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) - // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B)) - if ((isFMulNegTwo(N0) && N0.hasOneUse()) || - (isFMulNegTwo(N1) && N1.hasOneUse())) { - bool N1IsFMul = isFMulNegTwo(N1); - SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0); - SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags); - return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? 
N0 : N1, Add, Flags); + auto isFMulNegTwo = [](SDValue FMul) { + if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) + return false; + auto *C = isConstOrConstSplatFP(FMul.getOperand(1)); + return C && C->isExactlyValue(-2.0); + }; + + // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B) + if (isFMulNegTwo(N0)) { + SDValue B = N0.getOperand(0); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags); + } + // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B) + if (isFMulNegTwo(N1)) { + SDValue B = N1.getOperand(0); + SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags); + return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags); } // No FP constant should be created after legalization as Instruction -- GitLab From 2ef4e14af3ffff12baf279eff50f4e4792c97c34 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 16:54:07 +0000 Subject: [PATCH 0207/1116] [DAGCombiner] allow undef elts in vector fmul matching git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344534 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- test/CodeGen/AArch64/fadd-combines.ll | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ab871a25d07..11cc699ffe1 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10898,7 +10898,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { auto isFMulNegTwo = [](SDValue FMul) { if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) return false; - auto *C = isConstOrConstSplatFP(FMul.getOperand(1)); + auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true); return C && C->isExactlyValue(-2.0); }; diff --git a/test/CodeGen/AArch64/fadd-combines.ll b/test/CodeGen/AArch64/fadd-combines.ll index c2e4430029a..7332101a481 100644 --- a/test/CodeGen/AArch64/fadd-combines.ll +++ 
b/test/CodeGen/AArch64/fadd-combines.ll @@ -76,9 +76,8 @@ define <4 x float> @fmulnegtwo_vec_commute(<4 x float> %a, <4 x float> %b) { define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fmulnegtwo_vec_undefs: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #192, lsl #24 -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v1.4s, v1.4s, v1.4s +; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %mul = fmul <4 x float> %b, %add = fadd <4 x float> %a, %mul @@ -88,9 +87,8 @@ define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) { define <4 x float> @fmulnegtwo_vec_commute_undefs(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: fmulnegtwo_vec_commute_undefs: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v2.4s, #192, lsl #24 -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fadd v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fadd v1.4s, v1.4s, v1.4s +; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %mul = fmul <4 x float> %b, %add = fadd <4 x float> %mul, %a -- GitLab From abfefc95baa455d8beadac40792d1384228736b1 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 18:05:34 +0000 Subject: [PATCH 0208/1116] [SelectionDAG] allow FP binops in SimplifyDemandedVectorElts This is intended to make the backend on par with functionality that was added to the IR version of SimplifyDemandedVectorElts in: rL343727 ...and the original motivation is that we need to improve demanded-vector-elements in several ways to avoid problems that would be exposed in D51553. 
Differential Revision: https://reviews.llvm.org/D52912 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344541 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 7 ++++- test/CodeGen/X86/avx512-hadd-hsub.ll | 30 +++++++++---------- .../X86/avx512-intrinsics-fast-isel.ll | 16 +++++----- test/CodeGen/X86/vector-shuffle-combining.ll | 4 +-- 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d3a50788f79..150d22cffa7 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1712,7 +1712,12 @@ bool TargetLowering::SimplifyDemandedVectorElts( break; } case ISD::ADD: - case ISD::SUB: { + case ISD::SUB: + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: { APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef, SrcZero, TLO, Depth + 1)) diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll index 510553b56d4..aed182179cf 100644 --- a/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -178,16 +178,16 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) { define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_low: ; KNL: # %bb.0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_low: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd 
{{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; SKX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; SKX-NEXT: retq @@ -252,17 +252,15 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) { define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fadd_noundef_eel: ; KNL: # %bb.0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; KNL-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: fadd_noundef_eel: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -278,18 +276,18 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) { define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) { ; KNL-LABEL: fsub_noundef_ee: ; KNL: # %bb.0: -; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; KNL-NEXT: vsubpd %zmm0, %zmm2, %zmm0 
+; KNL-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; KNL-NEXT: vbroadcastsd %xmm0, %zmm0 +; KNL-NEXT: vsubpd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: vextractf32x4 $2, %zmm0, %xmm0 ; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; KNL-NEXT: retq ; ; SKX-LABEL: fsub_noundef_ee: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; SKX-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0 +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 +; SKX-NEXT: vsubpd %zmm1, %zmm0, %zmm0 ; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0 ; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; SKX-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll index fa37d2148f2..aa89ee7c390 100644 --- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -7304,7 +7304,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) { ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7321,7 +7321,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) { ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7354,7 +7354,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; 
X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7371,7 +7371,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) { ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7516,7 +7516,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7535,7 +7535,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7573,7 +7573,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -7593,7 +7593,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x 
float> %__W) ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] +; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index 2eb9362947e..01e36681400 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2703,7 +2703,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { ; SSE-LABEL: PR22377: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; SSE-NEXT: addps %xmm0, %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -2711,7 +2711,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { ; ; AVX-LABEL: PR22377: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -- GitLab From fcb831da6a919bd658d85ea26268dccb5cc9d086 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 18:34:36 +0000 Subject: [PATCH 0209/1116] [ADT] Fix a bug in DenseSet's initializer_list constructor. Without this fix, DenseSet crashes with an assertion if constructed with an initializer_list whose length is not a power of two. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344542 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ADT/DenseSet.h | 3 ++- unittests/ADT/DenseSetTest.cpp | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h index 404b2f74766..e85a38587e4 100644 --- a/include/llvm/ADT/DenseSet.h +++ b/include/llvm/ADT/DenseSet.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/type_traits.h" #include #include @@ -67,7 +68,7 @@ public: explicit DenseSetImpl(unsigned InitialReserve = 0) : TheMap(InitialReserve) {} DenseSetImpl(std::initializer_list Elems) - : DenseSetImpl(Elems.size()) { + : DenseSetImpl(PowerOf2Ceil(Elems.size())) { insert(Elems.begin(), Elems.end()); } diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp index 04f84e041fb..7368e2ed0e0 100644 --- a/unittests/ADT/DenseSetTest.cpp +++ b/unittests/ADT/DenseSetTest.cpp @@ -80,6 +80,14 @@ TYPED_TEST(DenseSetTest, InitializerList) { EXPECT_EQ(0u, set.count(3)); } +TYPED_TEST(DenseSetTest, InitializerListWithNonPowerOfTwoLength) { + TypeParam set({1, 2, 3}); + EXPECT_EQ(3u, set.size()); + EXPECT_EQ(1u, set.count(1)); + EXPECT_EQ(1u, set.count(2)); + EXPECT_EQ(1u, set.count(3)); +} + TYPED_TEST(DenseSetTest, ConstIteratorComparison) { TypeParam set({1}); const TypeParam &cset = set; -- GitLab From 0a2a30e517cde9ce927b9215158875c17141e794 Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Mon, 15 Oct 2018 19:22:20 +0000 Subject: [PATCH 0210/1116] [CodeExtractor] Erase debug intrinsics in outlined thunks (fix PR22900) Variable updates within the outlined function are invisible to debuggers. This could be improved by defining a DISubprogram for the new function. For the moment, simply erase the debug intrinsics instead. 
This fixes verifier failures about function-local metadata being used in the wrong function, seen while testing the hot/cold splitting pass. rdar://45142482 Differential Revision: https://reviews.llvm.org/D53267 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344545 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/CodeExtractor.cpp | 13 +++++ .../HotColdSplit/split-out-dbg-val-of-arg.ll | 51 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index 0e9e3219033..7b45b1799c4 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -1286,6 +1286,19 @@ Function *CodeExtractor::extractCodeRegion() { } } + // Erase debug info intrinsics. Variable updates within the new function are + // invisible to debuggers. This could be improved by defining a DISubprogram + // for the new function. 
+ for (BasicBlock &BB : *newFunction) { + auto BlockIt = BB.begin(); + while (BlockIt != BB.end()) { + Instruction *Inst = &*BlockIt; + ++BlockIt; + if (isa(Inst)) + Inst->eraseFromParent(); + } + } + LLVM_DEBUG(if (verifyFunction(*newFunction)) report_fatal_error("verifyFunction failed!")); return newFunction; diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll new file mode 100644 index 00000000000..4b81de7b35b --- /dev/null +++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll @@ -0,0 +1,51 @@ +; RUN: opt -hotcoldsplit -S < %s | FileCheck %s + +; CHECK-LABEL: define {{.*}}@foo_if.end +; CHECK-NOT: llvm.dbg.value + +define void @foo(i32 %arg1) !dbg !6 { +entry: + %var = add i32 0, 0, !dbg !11 + br i1 undef, label %if.then, label %if.end, !dbg !12 + +if.then: ; preds = %entry + unreachable, !dbg !13 + +if.end: ; preds = %entry + call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11 + br label %if.then12, !dbg !14 + +if.then12: ; preds = %if.end + br label %cleanup40, !dbg !15 + +cleanup40: ; preds = %if.then12 + br label %return, !dbg !16 + +return: ; preds = %cleanup40 + ret void, !dbg !17 +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!3, !4} +!llvm.module.flags = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "", directory: "/") +!2 = !{} +!3 = !{i32 7} +!4 = !{i32 1} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8) +!7 = !DISubroutineType(types: !2) +!8 = !{!9} +!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10) +!10 = 
!DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned) +!11 = !DILocation(line: 1, column: 1, scope: !6) +!12 = !DILocation(line: 2, column: 1, scope: !6) +!13 = !DILocation(line: 3, column: 1, scope: !6) +!14 = !DILocation(line: 4, column: 1, scope: !6) +!15 = !DILocation(line: 5, column: 1, scope: !6) +!16 = !DILocation(line: 6, column: 1, scope: !6) +!17 = !DILocation(line: 7, column: 1, scope: !6) -- GitLab From ecabdb23f632398ccbe0fc5a9a521469fd70de23 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Mon, 15 Oct 2018 20:15:58 +0000 Subject: [PATCH 0211/1116] [llvm-objcopy] NFC: update TODO test comment git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344550 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/llvm-objcopy/input-output-target.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tools/llvm-objcopy/input-output-target.test b/test/tools/llvm-objcopy/input-output-target.test index e81770a239a..7a7df9fd503 100644 --- a/test/tools/llvm-objcopy/input-output-target.test +++ b/test/tools/llvm-objcopy/input-output-target.test @@ -11,7 +11,7 @@ # RUN: llvm-objcopy --target binary -B i386:x86-64 %t.txt %t.3.txt # RUN: cmp %t-copy.txt %t.3.txt -# TODO: check --target and --input-target/--output-target are incompatible +# --target is incompatibile with --input-target/--output-target # RUN: not llvm-objcopy --target binary --input-target binary -B i386:x86-64 \ # RUN: %t.txt %t.4.txt 2>&1 \ # RUN: | FileCheck %s --check-prefix=BAD-FLAG -- GitLab From bea8b730d34af6991a91a4fe563234c5ea6eeabc Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Mon, 15 Oct 2018 20:37:47 +0000 Subject: [PATCH 0212/1116] AMDGPU: Generate .amdgcn_target for object code v3 Differential Revision: https://reviews.llvm.org/D53221 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344552 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 13 ++++- .../CodeGen/AMDGPU/directive-amdgcn-target.ll | 58 
+++++++++++++++++++ 2 files changed, 68 insertions(+), 3 deletions(-) create mode 100644 test/CodeGen/AMDGPU/directive-amdgcn-target.ll diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 7e6a406b1e3..7448dd71004 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -116,9 +116,16 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (IsaInfo::hasCodeObjectV3(getSTI()) && - TM.getTargetTriple().getOS() == Triple::AMDHSA) - return; + if (IsaInfo::hasCodeObjectV3(getSTI())) { + std::string ExpectedTarget; + raw_string_ostream ExpectedTargetOS(ExpectedTarget); + IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS); + + getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); + + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + return; + } if (TM.getTargetTriple().getOS() != Triple::AMDHSA && TM.getTargetTriple().getOS() != Triple::AMDPAL) diff --git a/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll new file mode 100644 index 00000000000..757da908af9 --- /dev/null +++ b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX600 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX600 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx601 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hainan -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=oland -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=pitcairn -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s +; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=verde -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX700 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX700 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx701 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX701 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX701 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx702 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX702 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kabini -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=mullins -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX704 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX704 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX801 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck 
--check-prefixes=GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris10 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris11 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX902 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX904 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX906 %s + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s + +; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600" +; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601" +; GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" +; GFX701: .amdgcn_target "amdgcn-amd-amdhsa--gfx701" +; GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702" +; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703" +; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704" +; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack" +; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802" +; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803" +; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack" +; GFX900: 
.amdgcn_target "amdgcn-amd-amdhsa--gfx900" +; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack" +; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904" +; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906" + +; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack" +; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902 + +define amdgpu_kernel void @directive_amdgcn_target() { + ret void +} -- GitLab From b34f2ee301ef507dacf1984181f87048cd81b9b6 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Mon, 15 Oct 2018 21:14:19 +0000 Subject: [PATCH 0213/1116] [CMake] Change the default value of LLVM_ENABLE_IDE There really aren't any generator behaviors that we need to take `CMAKE_EXTRA_GENERATOR` into account for. Where we need to take different behaviors for IDEs is mostly in enabling or disabling certain build system features that are optional but trip up the IDE UIs. Like the generation of lots of utility targets. By changing the LLVM_ENABLE_IDE default to only being on for multi-configuration generators, we allow gating where it will impact the UI presentation, while also supporting optionally disabling the generation if your tooling workflow encounters problems. Presently being able to manually disable extra target generation is useful for Visual Studio 2017's CMake integration where the IDE has trouble displaying and working with the large number of optional targets. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344553 91177308-0d34-0410-b5e6-96231b3b80d8 --- cmake/modules/HandleLLVMOptions.cmake | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake index 85aebf6ed71..27875781d22 100644 --- a/cmake/modules/HandleLLVMOptions.cmake +++ b/cmake/modules/HandleLLVMOptions.cmake @@ -868,16 +868,19 @@ else() set(LLVM_ENABLE_PLUGINS ON) endif() -# Remove LLVM_ENABLE_IDE from the CMake cache. 
This is a temporary change to -# allow CMake caches to be cleaned up so that we can change the default for this -# option and how it is used. -unset(LLVM_ENABLE_IDE CACHE) -#set(LLVM_ENABLE_IDE_default OFF) -#if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR) -# set(LLVM_ENABLE_IDE_default ON) -#endif() -#option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE" -# ${LLVM_ENABLE_IDE_default}) +# By default we should enable LLVM_ENABLE_IDE only for multi-configuration +# generators. This option disables optional build system features that make IDEs +# less usable. +set(LLVM_ENABLE_IDE_default OFF) +if (CMAKE_CONFIGURATION_TYPES) + set(LLVM_ENABLE_IDE_default ON) +endif() +option(LLVM_ENABLE_IDE + "Disable optional build system features that cause problems for IDE generators" + ${LLVM_ENABLE_IDE_default}) +if (CMAKE_CONFIGURATION_TYPES AND NOT LLVM_ENABLE_IDE) + message(WARNING "Disabling LLVM_ENABLE_IDE on multi-configuration generators is not recommended.") +endif() function(get_compile_definitions) get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS) -- GitLab From 5e9d76b982a1307ce32d562aa26fd8256506721f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 15 Oct 2018 21:15:58 +0000 Subject: [PATCH 0214/1116] [AARCH64] Improve vector popcnt lowering with ADDLP AARCH64 equivalent to D53257 - uses widening pairwise adds on vXi8 CTPOP to support i16/i32/i64 vectors. This is a blocker for generic vector CTPOP expansion (P32655) - this will remove the aarch64 diff from D53258. 
Differential Revision: https://reviews.llvm.org/D53259 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344554 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 48 +++++-- test/CodeGen/AArch64/arm64-vpopcnt.ll | 140 ++++----------------- 2 files changed, 62 insertions(+), 126 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 90633807cdf..fea1531540f 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -792,9 +792,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { for (MVT InnerVT : MVT::all_valuetypes()) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); - // CNT supports only B element sizes. + // CNT supports only B element sizes, then use UADDLP to widen. if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); @@ -4539,18 +4539,42 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); - if (VT == MVT::i32) - Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); - Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); + if (VT == MVT::i32 || VT == MVT::i64) { + if (VT == MVT::i32) + Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); - SDValue UaddLV = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); - if (VT == MVT::i64) - UaddLV = 
DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); - return UaddLV; + if (VT == MVT::i64) + UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); + return UaddLV; + } + + assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || + VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && + "Unexpected type for custom ctpop lowering"); + + EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; + Val = DAG.getBitcast(VT8Bit, Val); + Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val); + + // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. + unsigned EltSize = 8; + unsigned NumElts = VT.is64BitVector() ? 8 : 16; + while (EltSize != VT.getScalarSizeInBits()) { + EltSize *= 2; + NumElts /= 2; + MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); + Val = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val); + } + + return Val; } SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll index 0c223ced9ac..6fe1176eaa8 100644 --- a/test/CodeGen/AArch64/arm64-vpopcnt.ll +++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll @@ -17,30 +17,8 @@ declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone { ; CHECK-LABEL: ctpopv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[2], w8 -; 
CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: uaddlp v0.4h, v0.8b ; CHECK-NEXT: ret %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x) ret <4 x i16> %cnt @@ -51,18 +29,9 @@ declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone { ; CHECK-LABEL: ctpopv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: fmov d0, x0 ; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: uaddlp v0.4h, v0.8b +; CHECK-NEXT: uaddlp v0.2s, v0.4h ; CHECK-NEXT: ret %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x) ret <2 x i32> %cnt @@ -70,6 +39,20 @@ define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone { declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone +define <1 x i64> @ctpopv1i64(<1 x i64> %x) nounwind readnone { +; CHECK-LABEL: ctpopv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlp v0.4h, v0.8b +; CHECK-NEXT: uaddlp v0.2s, v0.4h +; CHECK-NEXT: uaddlp v0.1d, v0.2s +; CHECK-NEXT: ret + %cnt = tail call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %x) + ret <1 x i64> %cnt +} + +declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone + define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone { ; CHECK-LABEL: ctpopv16i8: ; CHECK: // %bb.0: @@ -84,53 +67,8 @@ declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone { ; CHECK-LABEL: ctpopv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umov w8, 
v0.h[1] -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: umov w8, v0.h[4] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[4], w8 -; CHECK-NEXT: umov w8, v0.h[5] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[5], w8 -; CHECK-NEXT: umov w8, v0.h[6] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[6], w8 -; CHECK-NEXT: umov w8, v0.h[7] -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov v1.h[7], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: uaddlp v0.8h, v0.16b ; CHECK-NEXT: ret %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x) ret <8 x i16> %cnt @@ -141,28 +79,9 @@ declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone { ; CHECK-LABEL: ctpopv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: fmov d0, x0 -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov d1, x8 
-; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov v0.s[1], w8 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov v0.s[2], w8 -; CHECK-NEXT: fmov d1, x10 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: uaddlp v0.8h, v0.16b +; CHECK-NEXT: uaddlp v0.4s, v0.8h ; CHECK-NEXT: ret %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x) ret <4 x i32> %cnt @@ -173,17 +92,10 @@ declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone { ; CHECK-LABEL: ctpopv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: cnt v1.8b, v0.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: fmov d1, x0 -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: cnt v0.16b, v0.16b +; CHECK-NEXT: uaddlp v0.8h, v0.16b +; CHECK-NEXT: uaddlp v0.4s, v0.8h +; CHECK-NEXT: uaddlp v0.2d, v0.4s ; CHECK-NEXT: ret %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x) ret <2 x i64> %cnt -- GitLab From 11b69c205d90be3ef1ebc5c676bdaa485a3c6475 Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Mon, 15 Oct 2018 21:20:02 +0000 Subject: [PATCH 0215/1116] [CMake] Use LLVM_ENABLE_IDE instead of CMAKE_CONFIGURATION_TYPES There are several places where we use CMAKE_CONFIGURATION_TYPES to determine if we are using an IDE generator and in turn decide not to generate some of the convenience targets (like all the install-* and check-llvm-* targets). This decision is made because IDEs don't always deal well with the thousands of targets LLVM can generate. 
This approach does not work for Visual Studio 15's new CMake integration. Because VS15 uses a Ninja generator, it isn't a multi-configuration build, and generating all these extra targets mucks up the UI and adds little value. With this change we still don't generate these targets by default for Visual Studio and Xcode generators, and LLVM_ENABLE_IDE becomes a switch that can be enabled on the VS15 CMake builds, to improve the IDE experience. This is a re-land of r340435, with a few minor fix-ups. The issues causing the revert were addressed in r344218, r344219, and r344553. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344555 91177308-0d34-0410-b5e6-96231b3b80d8 --- CMakeLists.txt | 6 +++--- cmake/modules/AddLLVM.cmake | 12 ++++++------ cmake/modules/CMakeLists.txt | 2 +- tools/xcode-toolchain/CMakeLists.txt | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e7a9dd8988..374bddbec2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -974,7 +974,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) add_custom_target(llvm-headers DEPENDS intrinsics_gen) set_target_properties(llvm-headers PROPERTIES FOLDER "Misc") - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) add_llvm_install_targets(install-llvm-headers DEPENDS llvm-headers COMPONENT llvm-headers) @@ -984,7 +984,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) add_custom_target(llvm-libraries) set_target_properties(llvm-libraries PROPERTIES FOLDER "Misc") - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) add_llvm_install_targets(install-llvm-libraries DEPENDS llvm-libraries COMPONENT llvm-libraries) @@ -1005,7 +1005,7 @@ endif() # This must be at the end of the LLVM root CMakeLists file because it must run # after all targets are created. if(LLVM_DISTRIBUTION_COMPONENTS) - if(CMAKE_CONFIGURATION_TYPES) + if(LLVM_ENABLE_IDE) message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. 
Xcode or Visual Studio)") endif() diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake index 4dde95e30f3..410308d46d6 100644 --- a/cmake/modules/AddLLVM.cmake +++ b/cmake/modules/AddLLVM.cmake @@ -659,7 +659,7 @@ macro(add_llvm_library name) ${install_type} DESTINATION ${install_dir} COMPONENT ${name}) - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) add_llvm_install_targets(install-${name} DEPENDS ${name} COMPONENT ${name}) @@ -890,7 +890,7 @@ macro(add_llvm_tool name) RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR} COMPONENT ${name}) - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) add_llvm_install_targets(install-${name} DEPENDS ${name} COMPONENT ${name}) @@ -928,7 +928,7 @@ macro(add_llvm_utility name) install (TARGETS ${name} RUNTIME DESTINATION ${LLVM_UTILS_INSTALL_DIR} COMPONENT ${name}) - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) add_llvm_install_targets(install-${name} DEPENDS ${name} COMPONENT ${name}) @@ -1409,7 +1409,7 @@ function(add_lit_testsuite target comment) endfunction() function(add_lit_testsuites project directory) - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) cmake_parse_arguments(ARG "" "" "PARAMS;DEPENDS;ARGS" ${ARGN}) # Search recursively for test directories by assuming anything not @@ -1468,7 +1468,7 @@ function(llvm_install_library_symlink name dest type) CODE "install_symlink(${full_name} ${full_dest} ${output_dir})" COMPONENT ${component}) - if (NOT CMAKE_CONFIGURATION_TYPES AND NOT ARG_ALWAYS_GENERATE) + if (NOT LLVM_ENABLE_IDE AND NOT ARG_ALWAYS_GENERATE) add_llvm_install_targets(install-${name} DEPENDS ${name} ${dest} install-${dest} COMPONENT ${name}) @@ -1501,7 +1501,7 @@ function(llvm_install_symlink name dest) CODE "install_symlink(${full_name} ${full_dest} ${LLVM_TOOLS_INSTALL_DIR})" COMPONENT ${component}) - if (NOT CMAKE_CONFIGURATION_TYPES AND NOT ARG_ALWAYS_GENERATE) + if (NOT LLVM_ENABLE_IDE AND NOT ARG_ALWAYS_GENERATE) 
add_llvm_install_targets(install-${name} DEPENDS ${name} ${dest} install-${dest} COMPONENT ${name}) diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt index 6c316a2f04f..f5cc0006fa0 100644 --- a/cmake/modules/CMakeLists.txt +++ b/cmake/modules/CMakeLists.txt @@ -132,7 +132,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) PATTERN LLVM-Config.cmake EXCLUDE PATTERN GetHostTriple.cmake EXCLUDE) - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) # Add a dummy target so this can be used with LLVM_DISTRIBUTION_COMPONENTS add_custom_target(cmake-exports) add_llvm_install_targets(install-cmake-exports diff --git a/tools/xcode-toolchain/CMakeLists.txt b/tools/xcode-toolchain/CMakeLists.txt index 0ae5e374fe9..6167f5f6bdd 100644 --- a/tools/xcode-toolchain/CMakeLists.txt +++ b/tools/xcode-toolchain/CMakeLists.txt @@ -100,7 +100,7 @@ add_llvm_install_targets(install-xcode-toolchain PREFIX ${LLVMToolchainDir}/usr/) if(LLVM_DISTRIBUTION_COMPONENTS) - if(CMAKE_CONFIGURATION_TYPES) + if(LLVM_ENABLE_IDE) message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)") endif() -- GitLab From e2d6b27abc89920c7b964c3b2d6544a2e51480d2 Mon Sep 17 00:00:00 2001 From: Sebastian Pop Date: Mon, 15 Oct 2018 21:43:11 +0000 Subject: [PATCH 0216/1116] [hot-cold-split] fix static analysis of cold regions Make the code of blockEndsInUnreachable to match the function blockEndsInUnreachable in CodeGen/BranchFolding.cpp. I also have added a note to make sure the code of this function will not be modified unless the back-end version is also modified. An early return before outlining has been added to avoid outlining the full function body when the first block in the function is marked cold. The static analysis of cold code has been amended to avoid marking the whole function as cold by back-propagation because the back-propagation would mark blocks with return statements as cold. 
The patch adds debug statements to help discover these problems. Differential Revision: https://reviews.llvm.org/D52904 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344558 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/HotColdSplitting.cpp | 48 +++++++++++++++++--- test/Transforms/HotColdSplit/split-cold-1.ll | 24 ++++++++-- test/Transforms/HotColdSplit/split-cold-2.ll | 4 ++ 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp index 9d2634f1bc9..fcea40dffd7 100644 --- a/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/lib/Transforms/IPO/HotColdSplitting.cpp @@ -101,14 +101,19 @@ static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit, return true; } +// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify +// this function unless you modify the MBB version as well. +// +/// A no successor, non-return block probably ends in unreachable and is cold. +/// Also consider a block that ends in an indirect branch to be a return block, +/// since many targets use plain indirect branches to return. bool blockEndsInUnreachable(const BasicBlock &BB) { + if (!succ_empty(&BB)) + return false; if (BB.empty()) return true; const Instruction *I = BB.getTerminator(); - if (isa(I) || isa(I)) - return true; - // Unreachable blocks do not have any successor. - return succ_empty(&BB); + return !(isa(I) || isa(I)); } static bool exceptionHandlingFunctions(const CallInst *CI) { @@ -123,8 +128,7 @@ static bool exceptionHandlingFunctions(const CallInst *CI) { FName == "__cxa_end_catch"; } -static -bool unlikelyExecuted(const BasicBlock &BB) { +static bool unlikelyExecuted(const BasicBlock &BB) { if (blockEndsInUnreachable(BB)) return true; // Exception handling blocks are unlikely executed. 
@@ -145,13 +149,32 @@ bool unlikelyExecuted(const BasicBlock &BB) { return false; } +static bool returnsOrHasSideEffects(const BasicBlock &BB) { + const TerminatorInst *I = BB.getTerminator(); + if (isa(I) || isa(I) || isa(I)) + return true; + + for (const Instruction &I : BB) + if (const CallInst *CI = dyn_cast(&I)) { + if (CI->hasFnAttr(Attribute::NoReturn)) + return true; + + if (isa(CI->getCalledValue())) + return true; + } + + return false; +} + static DenseSetBB getHotBlocks(Function &F) { // Mark all cold basic blocks. DenseSetBB ColdBlocks; for (BasicBlock &BB : F) - if (unlikelyExecuted(BB)) + if (unlikelyExecuted(BB)) { + LLVM_DEBUG(llvm::dbgs() << "\nForward propagation marks cold: " << BB); ColdBlocks.insert((const BasicBlock *)&BB); + } // Forward propagation: basic blocks are hot when they are reachable from the // beginning of the function through a path that does not contain cold blocks. @@ -203,7 +226,12 @@ static DenseSetBB getHotBlocks(Function &F) { if (ColdBlocks.count(It)) continue; + // Do not back-propagate to blocks that return or have side effects. + if (returnsOrHasSideEffects(*It)) + continue; + // Move the block from HotBlocks to ColdBlocks. + LLVM_DEBUG(llvm::dbgs() << "\nBack propagation marks cold: " << *It); HotBlocks.erase(It); ColdBlocks.insert(It); @@ -353,6 +381,12 @@ const Function *HotColdSplitting::outlineColdBlocks(Function &F, // Walking the dominator tree allows us to find the largest // cold region. BasicBlock *Begin = DT->getRootNode()->getBlock(); + + // Early return if the beginning of the function has been marked cold, + // otherwise all the function gets outlined. 
+ if (PSI->isColdBB(Begin, BFI) || !HotBlocks.count(Begin)) + return nullptr; + for (auto I = df_begin(Begin), E = df_end(Begin); I != E; ++I) { BasicBlock *BB = *I; if (PSI->isColdBB(BB, BFI) || !HotBlocks.count(BB)) { diff --git a/test/Transforms/HotColdSplit/split-cold-1.ll b/test/Transforms/HotColdSplit/split-cold-1.ll index 60ec234ab83..1a8138fe0d3 100644 --- a/test/Transforms/HotColdSplit/split-cold-1.ll +++ b/test/Transforms/HotColdSplit/split-cold-1.ll @@ -1,9 +1,11 @@ ; RUN: opt -hotcoldsplit -S < %s | FileCheck %s ; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s -; Outlined function is called from a basic block named codeRepl -; CHECK: codeRepl: -; CHECK-NEXT: call void @foo +; Check that the function is not split. Outlined function is called from a +; basic block named codeRepl. + +; CHECK-LABEL: @foo +; CHECK-NOT: codeRepl define void @foo() { entry: br i1 undef, label %if.then, label %if.end @@ -23,3 +25,19 @@ cleanup40: ; preds = %if.then12 return: ; preds = %cleanup40 ret void } + +; Check that the function is not split. We used to outline the full function. + +; CHECK-LABEL: @fun +; CHECK-NOT: codeRepl + +define void @fun() { +entry: + br i1 undef, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %if.end + +if.end: ; preds = %entry + ret void +} diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll index 101bc11cba9..e243a47623a 100644 --- a/test/Transforms/HotColdSplit/split-cold-2.ll +++ b/test/Transforms/HotColdSplit/split-cold-2.ll @@ -4,6 +4,10 @@ ; Make sure this compiles. This test used to fail with an invalid phi node: the ; two predecessors were outlined and the SSA representation was invalid. 
+; CHECK-LABEL: @fun +; CHECK: codeRepl: +; CHECK-NEXT: call void @fun_if.else + define void @fun() { entry: br i1 undef, label %if.then, label %if.else -- GitLab From a379b4f9b55f90f8263719f18aa5c2f408d321b6 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Mon, 15 Oct 2018 21:43:53 +0000 Subject: [PATCH 0217/1116] [InstCombine] add tests for bitwise logic --> select; NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344559 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/InstCombine/logical-select.ll | 36 +++++++++++++++++++ test/Transforms/InstCombine/vec_sext.ll | 8 ++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll index dd95cc02751..3ee0ba169b3 100644 --- a/test/Transforms/InstCombine/logical-select.ll +++ b/test/Transforms/InstCombine/logical-select.ll @@ -531,3 +531,39 @@ define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c) ret <4 x i32> %add } +; The 'ashr' guarantees that we have a bitmask, so this is select with truncated condition. 
+ +define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) { +; CHECK-LABEL: @allSignBits( +; CHECK-NEXT: [[BITMASK:%.*]] = ashr i32 [[COND:%.*]], 31 +; CHECK-NEXT: [[NOT_BITMASK:%.*]] = xor i32 [[BITMASK]], -1 +; CHECK-NEXT: [[A1:%.*]] = and i32 [[BITMASK]], [[TVAL:%.*]] +; CHECK-NEXT: [[A2:%.*]] = and i32 [[NOT_BITMASK]], [[FVAL:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = or i32 [[A1]], [[A2]] +; CHECK-NEXT: ret i32 [[SEL]] +; + %bitmask = ashr i32 %cond, 31 + %not_bitmask = xor i32 %bitmask, -1 + %a1 = and i32 %tval, %bitmask + %a2 = and i32 %not_bitmask, %fval + %sel = or i32 %a1, %a2 + ret i32 %sel +} + +define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) { +; CHECK-LABEL: @allSignBits_vec( +; CHECK-NEXT: [[BITMASK:%.*]] = ashr <4 x i8> [[COND:%.*]], +; CHECK-NEXT: [[NOT_BITMASK:%.*]] = xor <4 x i8> [[BITMASK]], +; CHECK-NEXT: [[A1:%.*]] = and <4 x i8> [[BITMASK]], [[TVAL:%.*]] +; CHECK-NEXT: [[A2:%.*]] = and <4 x i8> [[NOT_BITMASK]], [[FVAL:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = or <4 x i8> [[A2]], [[A1]] +; CHECK-NEXT: ret <4 x i8> [[SEL]] +; + %bitmask = ashr <4 x i8> %cond, + %not_bitmask = xor <4 x i8> %bitmask, + %a1 = and <4 x i8> %tval, %bitmask + %a2 = and <4 x i8> %fval, %not_bitmask + %sel = or <4 x i8> %a2, %a1 + ret <4 x i8> %sel +} + diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll index ea76115fc44..f244d49527b 100644 --- a/test/Transforms/InstCombine/vec_sext.ll +++ b/test/Transforms/InstCombine/vec_sext.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -instcombine -S | FileCheck %s -define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: @psignd_3( +define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @vec_select( ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]] ; CHECK-NEXT: [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], ; CHECK-NEXT: [[T1:%.*]] = xor <4 x i32> 
[[B_LOBIT1]], @@ -23,8 +23,8 @@ define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) { ret <4 x i32> %cond } -define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: @test1( +define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: @vec_select_alternate_sign_bit_test( ; CHECK-NEXT: [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]] ; CHECK-NEXT: [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], ; CHECK-NEXT: [[B_LOBIT1_NOT:%.*]] = xor <4 x i32> [[B_LOBIT1]], -- GitLab From dea373926eb58e3af50777a7c643c4d75a7c1a61 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 21:51:22 +0000 Subject: [PATCH 0218/1116] [X86] Regenerate avx2-intrinsics-x86.ll to compress the 32 vs 64 bit mode checks. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344560 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx2-intrinsics-x86.ll | 1747 ++++++++--------------- 1 file changed, 559 insertions(+), 1188 deletions(-) diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 7eaa7f1cf98..5b649df410b 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -5,25 +5,15 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packssdw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packssdw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: 
test_x86_avx2_packssdw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packssdw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_packssdw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_packssdw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -64,25 +54,15 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() { define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packsswb: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packsswb: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packsswb: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packsswb: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_packsswb: +; AVX2: ## %bb.0: +; 
AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_packsswb: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -123,25 +103,15 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() { define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packuswb: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packuswb: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packuswb: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packuswb: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_packuswb: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_packuswb: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> 
%res } @@ -182,25 +152,15 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() { define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_padds_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_padds_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_padds_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_padds_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_padds_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_padds_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -208,25 +168,15 @@ declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_padds_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_padds_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, 
%ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_padds_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_padds_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_padds_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_padds_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -234,25 +184,15 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readn define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmadd_wd: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_wd: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmadd_wd: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_wd: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0xf5,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmadd_wd: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmadd_wd: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -260,25 +200,15 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmaxs_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmaxs_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmaxs_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmaxs_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmaxs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmaxs_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1] +; 
AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -286,25 +216,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmaxu_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmaxu_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmaxu_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmaxu_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmaxu_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmaxu_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -312,25 +232,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmins_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## 
encoding: [0xc5,0xfd,0xea,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmins_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmins_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmins_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmins_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmins_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -338,25 +248,15 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pminu_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pminu_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pminu_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1] -; 
X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pminu_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pminu_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pminu_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -364,17 +264,11 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) { -; X86-LABEL: test_x86_avx2_pmovmskb: -; X86: ## %bb.0: -; X86-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] -; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_pmovmskb: -; X64: ## %bb.0: -; X64-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] -; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_pmovmskb: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0] +; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; [#uses=1] ret i32 %res } @@ -382,25 +276,15 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmulh_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: 
[0xc5,0xfd,0xe5,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmulh_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmulh_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmulh_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmulh_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmulh_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -408,25 +292,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmulhu_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmulhu_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmulhu_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1] -; 
X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmulhu_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmulhu_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmulhu_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -434,25 +308,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psad_bw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psad_bw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psad_bw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psad_bw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psad_bw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: 
[0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psad_bw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -460,25 +324,15 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psll_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psll_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psll_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psll_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psll_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psll_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -486,25 +340,15 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> 
@test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psll_q: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psll_q: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psll_q: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psll_q: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psll_q: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psll_q: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -512,25 +356,15 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psll_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psll_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; 
X64-AVX-LABEL: test_x86_avx2_psll_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psll_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psll_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psll_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -538,25 +372,15 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) { -; X86-AVX-LABEL: test_x86_avx2_pslli_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pslli_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pslli_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pslli_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pslli_d: +; AVX2: ## %bb.0: +; 
AVX2-NEXT: vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pslli_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -564,25 +388,15 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) { -; X86-AVX-LABEL: test_x86_avx2_pslli_q: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pslli_q: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pslli_q: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pslli_q: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pslli_q: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pslli_q: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -590,25 +404,15 @@ 
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) { -; X86-AVX-LABEL: test_x86_avx2_pslli_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pslli_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pslli_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pslli_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pslli_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pslli_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -616,25 +420,15 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psra_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psra_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0xe2,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psra_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psra_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psra_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psra_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -642,25 +436,15 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psra_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psra_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psra_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psra_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1] -; 
X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psra_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psra_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -668,25 +452,15 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) { -; X86-AVX-LABEL: test_x86_avx2_psrai_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrai_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrai_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrai_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrai_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrai_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> 
@llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -694,25 +468,15 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) { -; X86-AVX-LABEL: test_x86_avx2_psrai_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrai_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrai_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrai_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrai_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrai_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -720,25 +484,15 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrl_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: 
test_x86_avx2_psrl_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrl_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrl_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrl_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrl_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -746,25 +500,15 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrl_q: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrl_q: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrl_q: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrl_q: -; X64-AVX512VL: ## %bb.0: -; 
X64-AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrl_q: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrl_q: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -772,25 +516,15 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrl_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrl_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrl_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrl_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc5,0xfd,0xd1,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -798,25 +532,15 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) { -; X86-AVX-LABEL: test_x86_avx2_psrli_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrli_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrli_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrli_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrli_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrli_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -824,25 +548,15 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) { -; X86-AVX-LABEL: test_x86_avx2_psrli_q: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlq $7, %ymm0, %ymm0 ## encoding: 
[0xc5,0xfd,0x73,0xd0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrli_q: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrli_q: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrli_q: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrli_q: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrli_q: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -850,25 +564,15 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) { -; X86-AVX-LABEL: test_x86_avx2_psrli_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrli_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrli_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07] -; X64-AVX-NEXT: retq ## encoding: 
[0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrli_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrli_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrli_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -876,25 +580,15 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubs_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubs_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psubs_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psubs_b: +; 
AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -902,40 +596,25 @@ declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psubs_w: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psubs_w: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psubs_w: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psubs_w: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) { -; 
X86-LABEL: test_x86_avx2_phadd_d: -; X86: ## %bb.0: -; X86-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_phadd_d: -; X64: ## %bb.0: -; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_phadd_d: +; CHECK: ## %bb.0: +; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -943,15 +622,10 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) { -; X86-LABEL: test_x86_avx2_phadd_sw: -; X86: ## %bb.0: -; X86-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_phadd_sw: -; X64: ## %bb.0: -; X64-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_phadd_sw: +; CHECK: ## %bb.0: +; CHECK-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -959,15 +633,10 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-LABEL: test_x86_avx2_phadd_w: -; X86: ## %bb.0: -; X86-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_phadd_w: -; X64: ## %bb.0: -; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: 
[0xc4,0xe2,0x7d,0x01,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_phadd_w: +; CHECK: ## %bb.0: +; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -975,15 +644,10 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) { -; X86-LABEL: test_x86_avx2_phsub_d: -; X86: ## %bb.0: -; X86-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_phsub_d: -; X64: ## %bb.0: -; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_phsub_d: +; CHECK: ## %bb.0: +; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -991,15 +655,10 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) { -; X86-LABEL: test_x86_avx2_phsub_sw: -; X86: ## %bb.0: -; X86-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_phsub_sw: -; X64: ## %bb.0: -; X64-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_phsub_sw: +; CHECK: ## %bb.0: +; CHECK-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> 
%a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1007,15 +666,10 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-LABEL: test_x86_avx2_phsub_w: -; X86: ## %bb.0: -; X86-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_phsub_w: -; X64: ## %bb.0: -; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_phsub_w: +; CHECK: ## %bb.0: +; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1023,25 +677,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmadd_ub_sw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmadd_ub_sw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: 
test_x86_avx2_pmadd_ub_sw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1080,25 +724,15 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8> } define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmul_hr_sw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmul_hr_sw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmul_hr_sw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] +; AVX512VL-NEXT: 
ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1106,25 +740,15 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pshuf_b: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pshuf_b: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pshuf_b: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pshuf_b: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pshuf_b: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pshuf_b: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i8>> [#uses=1] ret <32 x i8> %res } @@ -1132,15 +756,10 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) { -; X86-LABEL: test_x86_avx2_psign_b: -; X86: ## %bb.0: -; X86-NEXT: vpsignb %ymm1, %ymm0, 
%ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_psign_b: -; X64: ## %bb.0: -; X64-NEXT: vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_psign_b: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -1148,15 +767,10 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) { -; X86-LABEL: test_x86_avx2_psign_d: -; X86: ## %bb.0: -; X86-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_psign_d: -; X64: ## %bb.0: -; X64-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_psign_d: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1164,15 +778,10 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) { -; X86-LABEL: test_x86_avx2_psign_w: -; X86: ## %bb.0: -; X86-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_psign_w: -; X64: ## %bb.0: -; X64-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_psign_w: +; CHECK: ## %bb.0: +; 
CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1180,15 +789,10 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { -; X86-LABEL: test_x86_avx2_mpsadbw: -; X86: ## %bb.0: -; X86-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_mpsadbw: -; X64: ## %bb.0: -; X64-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_mpsadbw: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1196,25 +800,15 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_packusdw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_packusdw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_packusdw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_packusdw: -; 
X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_packusdw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_packusdw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1255,15 +849,10 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() { define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) { -; X86-LABEL: test_x86_avx2_pblendvb: -; X86: ## %bb.0: -; X86-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_pblendvb: -; X64: ## %bb.0: -; X64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_pblendvb: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -1271,17 +860,11 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { -; X86-LABEL: test_x86_avx2_pblendw: -; X86: ## %bb.0: -; X86-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07] -; X86-NEXT: ## ymm0 = 
ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_pblendw: -; X64: ## %bb.0: -; X64-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07] -; X64-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_pblendw: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07] +; CHECK-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1289,25 +872,15 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmaxsb: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmaxsb: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmaxsb: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmaxsb: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmaxsb: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: 
[0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmaxsb: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -1315,25 +888,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmaxsd: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmaxsd: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmaxsd: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmaxsd: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmaxsd: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmaxsd: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1341,25 +904,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x 
i32>) nounwind readnone define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmaxud: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmaxud: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmaxud: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmaxud: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmaxud: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmaxud: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1367,25 +920,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pmaxuw: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pmaxuw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pmaxuw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pmaxuw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pmaxuw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pmaxuw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1393,25 +936,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pminsb: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pminsb: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pminsb: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pminsb: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression 
encoding: [0xc4,0xe2,0x7d,0x38,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pminsb: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pminsb: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] ret <32 x i8> %res } @@ -1419,25 +952,15 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pminsd: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pminsd: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pminsd: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pminsd: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pminsd: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pminsd: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7d,0x39,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1445,25 +968,15 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pminud: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pminud: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pminud: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pminud: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pminud: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pminud: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1471,25 +984,15 @@ declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) { -; X86-AVX-LABEL: test_x86_avx2_pminuw: -; X86-AVX: ## %bb.0: 
-; X86-AVX-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_pminuw: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_pminuw: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_pminuw: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_pminuw: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pminuw: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } @@ -1497,17 +1000,11 @@ declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readn define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { -; X86-LABEL: test_x86_avx2_pblendd_128: -; X86: ## %bb.0: -; X86-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] -; X86-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_pblendd_128: -; X64: ## %bb.0: -; X64-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] -; X64-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: 
test_x86_avx2_pblendd_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08] +; CHECK-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1515,17 +1012,11 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { -; X86-LABEL: test_x86_avx2_pblendd_256: -; X86: ## %bb.0: -; X86-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07] -; X86-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; X86-NEXT: retl ## encoding: [0xc3] -; -; X64-LABEL: test_x86_avx2_pblendd_256: -; X64: ## %bb.0: -; X64-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07] -; X64-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; X64-NEXT: retq ## encoding: [0xc3] +; CHECK-LABEL: test_x86_avx2_pblendd_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07] +; CHECK-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1536,25 +1027,15 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind ; and its lowering. Indeed, the offsets are the first source in ; the instruction. 
define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_permd: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_permd: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_permd: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_permd: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_permd: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_permd: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1565,25 +1046,15 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly ; and its lowering. Indeed, the offsets are the first source in ; the instruction. 
define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_permps: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_permps: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_permps: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_permps: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_permps: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_permps: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1] ret <8 x float> %res } @@ -1731,25 +1202,15 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psllv_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x47,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psllv_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psllv_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psllv_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1757,25 +1218,15 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psllv_d_256: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psllv_d_256: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_256: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psllv_d_256: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psllv_d_256: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1783,25 +1234,15 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psllv_q: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psllv_q: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psllv_q: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psllv_q: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -1809,25 +1250,15 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psllv_q_256: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_256: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psllv_q_256: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_256: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psllv_q_256: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psllv_q_256: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -1835,25 +1266,15 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) { 
-; X86-AVX-LABEL: test_x86_avx2_psrlv_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrlv_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrlv_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrlv_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -1861,25 +1282,15 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1] -; X86-AVX512VL-NEXT: retl ## 
encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrlv_d_256: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrlv_d_256: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } @@ -1887,25 +1298,15 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrlv_q: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrlv_q: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1] -; 
X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrlv_q: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrlv_q: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -1913,25 +1314,15 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrlv_q_256: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrlv_q_256: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0xfd,0x45,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -1939,25 +1330,15 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrav_d: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrav_d: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrav_d: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrav_d: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } @@ -2004,25 +1385,15 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) { declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i32> 
@test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) { -; X86-AVX-LABEL: test_x86_avx2_psrav_d_256: -; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1] -; X86-AVX-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256: -; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1] -; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] -; -; X64-AVX-LABEL: test_x86_avx2_psrav_d_256: -; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1] -; X64-AVX-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256: -; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1] -; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] +; AVX2-LABEL: test_x86_avx2_psrav_d_256: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1] +; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_psrav_d_256: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } -- GitLab From 672e9ba7300c48646dcdc16a46be60c4e0acadef Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 21:51:26 +0000 Subject: [PATCH 0219/1116] [X86] Disable the peephole pass on avx2-intrinsics-x86.ll and avx512bw-intrinsics.ll to ensure any load folding tests are testing isel not load folding tables. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344561 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx2-intrinsics-x86.ll | 8 ++++---- test/CodeGen/X86/avx512bw-intrinsics.ll | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 5b649df410b..10d40e556c8 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX -; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL +; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX +; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX +; RUN: llc < %s 
-disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) { ; AVX2-LABEL: test_x86_avx2_packssdw: diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index c17ba57d11a..4cd51bc1e91 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 define i32 @test_int_x86_avx512_kadd_d(<32 x i16> %A, <32 x i16> %B) nounwind { ; CHECK-LABEL: test_int_x86_avx512_kadd_d: -- GitLab From f27f35e3318010160d579446899eec6ec12569bb Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 21:51:29 +0000 Subject: [PATCH 0220/1116] [X86] Add test cases showing failure to fold load into vpsrlw when EVEX encoded instructions are used. There's a bad bitcast being used in the isel patterns for the vXi16 shift instructions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344562 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx2-intrinsics-x86.ll | 84 +++++++++++++++++-------- test/CodeGen/X86/avx512bw-intrinsics.ll | 17 +++++ test/CodeGen/X86/sse2-intrinsics-x86.ll | 41 ++++++++++++ 3 files changed, 115 insertions(+), 27 deletions(-) diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 10d40e556c8..bba70b139e2 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -531,6 +531,36 @@ define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) { declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone +define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) { +; X86-AVX-LABEL: test_x86_avx2_psrl_w_load: +; X86-AVX: ## %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX-NEXT: vpsrlw (%eax), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0x00] +; X86-AVX-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w_load: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512VL-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] +; X86-AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] +; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] +; +; X64-AVX-LABEL: test_x86_avx2_psrl_w_load: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0x07] +; X64-AVX-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w_load: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] +; X64-AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] +; 
X64-AVX512VL-NEXT: retq ## encoding: [0xc3] + %a1 = load <8 x i16>, <8 x i16>* %p + %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} + + define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) { ; AVX2-LABEL: test_x86_avx2_psrli_d: ; AVX2: ## %bb.0: @@ -820,28 +850,28 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() { ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovaps LCPI50_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] +; X86-AVX512VL-NEXT: vmovaps LCPI51_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0] ; X64-AVX512VL-NEXT: ## 
encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> ) ret <16 x i16> %res @@ -1348,36 +1378,36 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) { ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X86-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4 -; X86-AVX-NEXT: vpsravd LCPI82_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI83_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI82_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] +; X86-AVX512VL-NEXT: vmovdqa LCPI83_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI82_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI83_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## 
fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23] ; X64-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> ) ret <4 x i32> %res @@ -1403,36 +1433,36 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) ; X86-AVX: ## %bb.0: ; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4 -; X86-AVX-NEXT: 
vpsravd LCPI84_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4 +; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4 +; X86-AVX-NEXT: vpsravd LCPI85_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4 ; X86-AVX-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X86-AVX512VL: ## %bb.0: -; X86-AVX512VL-NEXT: vmovdqa LCPI84_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] +; X86-AVX512VL-NEXT: vmovdqa LCPI85_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpsravd LCPI84_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4 +; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsravd LCPI85_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4 ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX: ## %bb.0: ; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX-NEXT: ## fixup A - offset: 5, value: 
LCPI84_1-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const: ; X64-AVX512VL: ## %bb.0: ; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51] ; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> ) ret <8 x i32> %res diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 4cd51bc1e91..650235d51b3 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1948,6 +1948,23 @@ define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a } declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone +define <32 x i16> @test_x86_avx512_psrl_w_512_load(<32 x i16> %a0, <8 x i16>* %p) { +; X86-LABEL: test_x86_avx512_psrl_w_512_load: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vmovdqa (%eax), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x08] +; X86-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_avx512_psrl_w_512_load: +; X64: # %bb.0: +; X64-NEXT: vmovdqa (%rdi), 
%xmm1 # encoding: [0xc5,0xf9,0x6f,0x0f] +; X64-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %a1 = load <8 x i16>, <8 x i16>* %p + %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) { ; CHECK-LABEL: test_x86_avx512_psrli_w_512: diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index 068b0421a0b..020c4985943 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1418,6 +1418,47 @@ define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) { declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone +define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) { +; X86-SSE-LABEL: test_x86_sse2_psrl_w_load: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-SSE-NEXT: psrlw (%eax), %xmm0 ## encoding: [0x66,0x0f,0xd1,0x00] +; X86-SSE-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX1-LABEL: test_x86_sse2_psrl_w_load: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX1-NEXT: vpsrlw (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0x00] +; X86-AVX1-NEXT: retl ## encoding: [0xc3] +; +; X86-AVX512-LABEL: test_x86_sse2_psrl_w_load: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] +; X86-AVX512-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] +; X86-AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1] +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: test_x86_sse2_psrl_w_load: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: psrlw (%rdi), %xmm0 ## encoding: 
[0x66,0x0f,0xd1,0x07] +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: test_x86_sse2_psrl_w_load: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0x07] +; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: test_x86_sse2_psrl_w_load: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] +; X64-AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1] +; X64-AVX512-NEXT: retq ## encoding: [0xc3] + %a1 = load <8 x i16>, <8 x i16>* %p + %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} + + define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) { ; SSE-LABEL: test_x86_sse2_psrli_d: ; SSE: ## %bb.0: -- GitLab From c6a0661256ae8e6d00a1ed6e87cd646818b3b1d4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 21:51:32 +0000 Subject: [PATCH 0221/1116] [X86] Fix a bad bitcast in the load form of vXi16 uniform shift patterns for EVEX encoded instructions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344563 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 19 ++++++++++--------- test/CodeGen/X86/avx2-intrinsics-x86.ll | 6 ++---- test/CodeGen/X86/avx512bw-intrinsics.ll | 6 ++---- test/CodeGen/X86/sse2-intrinsics-x86.ll | 6 ++---- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index b1cb1545ec4..158aba447ed 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5826,7 +5826,7 @@ multiclass avx512_shift_rmbi opc, Format ImmFormM, multiclass avx512_shift_rrm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, ValueType SrcVT, - PatFrag bc_frag, X86VectorVTInfo _> { + X86VectorVTInfo _> { // src2 is always 128-bit let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable opc, string OpcodeStr, SDNode OpNode, defm rm : AVX512_maskable, + (_.VT (OpNode _.RC:$src1, + (SrcVT (bitconvert (loadv2i64 addr:$src2)))))>, AVX512BIBase, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5845,18 +5846,18 @@ multiclass avx512_shift_rrm opc, string OpcodeStr, SDNode OpNode, multiclass avx512_shift_sizes opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched, ValueType SrcVT, - PatFrag bc_frag, AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in defm Z : avx512_shift_rrm, EVEX_V512, + VTInfo.info512>, EVEX_V512, EVEX_CD8 ; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_shift_rrm, EVEX_V256, + VTInfo.info256>, EVEX_V256, EVEX_CD8; defm Z128 : avx512_shift_rrm, EVEX_V128, + VTInfo.info128>, EVEX_V128, EVEX_CD8; } } @@ -5866,12 +5867,12 @@ multiclass avx512_shift_types opcd, bits<8> opcq, bits<8> opcw, X86SchedWriteWidths sched, bit NotEVEX2VEXConvertibleQ = 0> { defm D : avx512_shift_sizes; + avx512vl_i32_info, HasAVX512>; let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in defm Q : 
avx512_shift_sizes, VEX_W; + avx512vl_i64_info, HasAVX512>, VEX_W; defm W : avx512_shift_sizes; + avx512vl_i16_info, HasBWI>; } multiclass avx512_shift_rmi_sizes opc, Format ImmFormR, Format ImmFormM, diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index bba70b139e2..101448e22ac 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -541,8 +541,7 @@ define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) { ; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w_load: ; X86-AVX512VL: ## %bb.0: ; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512VL-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] -; X86-AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] +; X86-AVX512VL-NEXT: vpsrlw (%eax), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0x00] ; X86-AVX512VL-NEXT: retl ## encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrl_w_load: @@ -552,8 +551,7 @@ define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) { ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w_load: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] -; X64-AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1] +; X64-AVX512VL-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0x07] ; X64-AVX512VL-NEXT: retq ## encoding: [0xc3] %a1 = load <8 x i16>, <8 x i16>* %p %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1] diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 650235d51b3..cf52746c3a5 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll 
@@ -1952,14 +1952,12 @@ define <32 x i16> @test_x86_avx512_psrl_w_512_load(<32 x i16> %a0, <8 x i16>* %p ; X86-LABEL: test_x86_avx512_psrl_w_512_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vmovdqa (%eax), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x08] -; X86-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1] +; X86-NEXT: vpsrlw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx512_psrl_w_512_load: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm1 # encoding: [0xc5,0xf9,0x6f,0x0f] -; X64-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1] +; X64-NEXT: vpsrlw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0x07] ; X64-NEXT: retq # encoding: [0xc3] %a1 = load <8 x i16>, <8 x i16>* %p %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1] diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index 020c4985943..8dedce5fc8b 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -1434,8 +1434,7 @@ define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) { ; X86-AVX512-LABEL: test_x86_sse2_psrl_w_load: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08] -; X86-AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1] +; X86-AVX512-NEXT: vpsrlw (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0x00] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse2_psrl_w_load: @@ -1450,8 +1449,7 @@ define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) { ; ; X64-AVX512-LABEL: 
test_x86_sse2_psrl_w_load: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f] -; X64-AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1] +; X64-AVX512-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0x07] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %a1 = load <8 x i16>, <8 x i16>* %p %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] -- GitLab From 726b0ec4982cd872ba4577900f4d6adb2b1753b6 Mon Sep 17 00:00:00 2001 From: Erik Pilkington Date: Mon, 15 Oct 2018 22:03:53 +0000 Subject: [PATCH 0222/1116] NFC: Fix a -Wsign-conversion warning git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344564 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Demangle/ItaniumDemangle.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index 2165cbab7e7..8d132c7580f 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -112,14 +112,20 @@ struct DumpVisitor { printStr("}"); --Depth; } + // Overload used when T is exactly 'bool', not merely convertible to 'bool'. - template - void print(T B) { - printStr(B ? "true" : "false"); + void print(bool B) { printStr(B ? 
"true" : "false"); } + + template + typename std::enable_if::value>::type print(T N) { + fprintf(stderr, "%llu", (unsigned long long)N); } - void print(size_t N) { - fprintf(stderr, "%zu", N); + + template + typename std::enable_if::value>::type print(T N) { + fprintf(stderr, "%lld", (long long)N); } + void print(ReferenceKind RK) { switch (RK) { case ReferenceKind::LValue: -- GitLab From 52ff03cff907cf240f95cbcbbec5e0f3b43c76b0 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 22:27:02 +0000 Subject: [PATCH 0223/1116] [ORC] Switch to DenseMap/DenseSet for ORC symbol map/set types. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344565 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/Core.h | 19 +++---- .../ExecutionEngine/Orc/SymbolStringPool.h | 27 ++++++++++ lib/ExecutionEngine/Orc/Core.cpp | 53 +++++++++++-------- lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 14 ++--- 4 files changed, 73 insertions(+), 40 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h index 67b16894f6c..86c5ebb6d27 100644 --- a/include/llvm/ExecutionEngine/Orc/Core.h +++ b/include/llvm/ExecutionEngine/Orc/Core.h @@ -20,10 +20,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" -#include -#include #include -#include #include #define DEBUG_TYPE "orc" @@ -44,18 +41,18 @@ using VModuleKey = uint64_t; /// A set of symbol names (represented by SymbolStringPtrs for // efficiency). -using SymbolNameSet = std::set; +using SymbolNameSet = DenseSet; /// A map from symbol names (as SymbolStringPtrs) to JITSymbols /// (address/flags pairs). -using SymbolMap = std::map; +using SymbolMap = DenseMap; /// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags. -using SymbolFlagsMap = std::map; +using SymbolFlagsMap = DenseMap; /// A base class for materialization failures that allows the failing /// symbols to be obtained for logging. 
-using SymbolDependenceMap = std::map; +using SymbolDependenceMap = DenseMap; /// A list of JITDylib pointers. using JITDylibList = std::vector; @@ -339,7 +336,7 @@ struct SymbolAliasMapEntry { }; /// A map of Symbols to (Symbol, Flags) pairs. -using SymbolAliasMap = std::map; +using SymbolAliasMap = DenseMap; /// A materialization unit for symbol aliases. Allows existing symbols to be /// aliased with alternate flags. @@ -489,7 +486,7 @@ public: JITDylib &Parent, const SymbolNameSet &Names)>; using AsynchronousSymbolQuerySet = - std::set>; + std::set>; JITDylib(const JITDylib &) = delete; JITDylib &operator=(const JITDylib &) = delete; @@ -609,7 +606,7 @@ private: }; using UnmaterializedInfosMap = - std::map>; + DenseMap>; struct MaterializingInfo { AsynchronousSymbolQueryList PendingQueries; @@ -618,7 +615,7 @@ private: bool IsEmitted = false; }; - using MaterializingInfosMap = std::map; + using MaterializingInfosMap = DenseMap; using LookupImplActionFlags = enum { None = 0, diff --git a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h index 4c45cfd199d..717076e2560 100644 --- a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h +++ b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h @@ -14,6 +14,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H #define LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include #include @@ -49,10 +50,13 @@ private: /// Pointer to a pooled string representing a symbol name. 
class SymbolStringPtr { friend class SymbolStringPool; + friend struct DenseMapInfo; friend bool operator==(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS); friend bool operator<(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS); + static SymbolStringPool::PoolMapEntry Tombstone; + public: SymbolStringPtr() = default; SymbolStringPtr(const SymbolStringPtr &Other) @@ -142,6 +146,29 @@ inline bool SymbolStringPool::empty() const { } } // end namespace orc + +template <> +struct DenseMapInfo { + + static orc::SymbolStringPtr getEmptyKey() { + return orc::SymbolStringPtr(); + } + + static orc::SymbolStringPtr getTombstoneKey() { + return orc::SymbolStringPtr(&orc::SymbolStringPtr::Tombstone); + } + + static unsigned getHashValue(orc::SymbolStringPtr V) { + uintptr_t IV = reinterpret_cast(V.S); + return unsigned(IV) ^ unsigned(IV >> 9); + } + + static bool isEqual(const orc::SymbolStringPtr &LHS, + const orc::SymbolStringPtr &RHS) { + return LHS.S == RHS.S; + } +}; + } // end namespace llvm #endif // LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp index 3fa28a5af6f..d477ca523d8 100644 --- a/lib/ExecutionEngine/Orc/Core.cpp +++ b/lib/ExecutionEngine/Orc/Core.cpp @@ -134,6 +134,8 @@ struct PrintSymbolMapElemsMatchingCLOpts { namespace llvm { namespace orc { + SymbolStringPool::PoolMapEntry SymbolStringPtr::Tombstone(0); + char FailedToMaterialize::ID = 0; char SymbolsNotFound::ID = 0; char SymbolsCouldNotBeRemoved::ID = 0; @@ -575,20 +577,22 @@ void ReExportsMaterializationUnit::materialize( SymbolNameSet QuerySymbols; SymbolAliasMap QueryAliases; - for (auto I = RequestedAliases.begin(), E = RequestedAliases.end(); - I != E;) { - auto Tmp = I++; - + // Collect as many aliases as we can without including a chain. + for (auto &KV : RequestedAliases) { // Chain detected. Skip this symbol for this round. 
- if (&SrcJD == &TgtJD && (QueryAliases.count(Tmp->second.Aliasee) || - RequestedAliases.count(Tmp->second.Aliasee))) + if (&SrcJD == &TgtJD && (QueryAliases.count(KV.second.Aliasee) || + RequestedAliases.count(KV.second.Aliasee))) continue; - ResponsibilitySymbols.insert(Tmp->first); - QuerySymbols.insert(Tmp->second.Aliasee); - QueryAliases[Tmp->first] = std::move(Tmp->second); - RequestedAliases.erase(Tmp); + ResponsibilitySymbols.insert(KV.first); + QuerySymbols.insert(KV.second.Aliasee); + QueryAliases[KV.first] = std::move(KV.second); } + + // Remove the aliases collected this round from the RequestedAliases map. + for (auto &KV : QueryAliases) + RequestedAliases.erase(KV.first); + assert(!QuerySymbols.empty() && "Alias cycle detected!"); auto QueryInfo = std::make_shared( @@ -1172,10 +1176,9 @@ void JITDylib::lodgeQueryImpl( std::shared_ptr &Q, SymbolNameSet &Unresolved, JITDylib *MatchNonExportedInJD, bool MatchNonExported, std::vector> &MUs) { - for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) { - auto TmpI = I++; - auto Name = *TmpI; + std::vector ToRemove; + for (auto Name : Unresolved) { // Search for the name in Symbols. Skip it if not found. auto SymI = Symbols.find(Name); if (SymI == Symbols.end()) @@ -1188,9 +1191,9 @@ void JITDylib::lodgeQueryImpl( if (!MatchNonExported && MatchNonExportedInJD != this) continue; - // If we matched against Name in JD, remove it frome the Unresolved set and - // add it to the added set. - Unresolved.erase(TmpI); + // If we matched against Name in JD, mark it to be removed from the Unresolved + // set. + ToRemove.push_back(Name); // If the symbol has an address then resolve it. if (SymI->second.getAddress() != 0) @@ -1235,6 +1238,10 @@ void JITDylib::lodgeQueryImpl( MI.PendingQueries.push_back(Q); Q->addQueryDependence(*this, Name); } + + // Remove any symbols that we found. 
+ for (auto &Name : ToRemove) + Unresolved.erase(Name); } SymbolNameSet JITDylib::legacyLookup(std::shared_ptr Q, @@ -1294,19 +1301,17 @@ JITDylib::lookupImpl(std::shared_ptr &Q, std::vector> &MUs, SymbolNameSet &Unresolved) { LookupImplActionFlags ActionFlags = None; + std::vector ToRemove; - for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) { - auto TmpI = I++; - auto Name = *TmpI; + for (auto Name : Unresolved) { // Search for the name in Symbols. Skip it if not found. auto SymI = Symbols.find(Name); if (SymI == Symbols.end()) continue; - // If we found Name, remove it frome the Unresolved set and add it - // to the dependencies set. - Unresolved.erase(TmpI); + // If we found Name, mark it to be removed from the Unresolved set. + ToRemove.push_back(Name); // If the symbol has an address then resolve it. if (SymI->second.getAddress() != 0) { @@ -1357,6 +1362,10 @@ JITDylib::lookupImpl(std::shared_ptr &Q, Q->addQueryDependence(*this, Name); } + // Remove any marked symbols from the Unresolved set. 
+ for (auto &Name : ToRemove) + Unresolved.erase(Name); + return ActionFlags; } diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 667237373ca..4c8f725df54 100644 --- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -167,13 +167,13 @@ int LocalCXXRuntimeOverridesBase::CXAAtExitOverride(DestructorPtr Destructor, Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD, MangleAndInterner &Mangle) { - SymbolMap RuntimeInterposes( - {{Mangle("__dso_handle"), - JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride), - JITSymbolFlags::Exported)}, - {Mangle("__cxa_atexit"), - JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride), - JITSymbolFlags::Exported)}}); + SymbolMap RuntimeInterposes; + RuntimeInterposes[Mangle("__dso_handle")] = + JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride), + JITSymbolFlags::Exported); + RuntimeInterposes[Mangle("__cxa_atexit")] = + JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride), + JITSymbolFlags::Exported); return JD.define(absoluteSymbols(std::move(RuntimeInterposes))); } -- GitLab From 6712561e1900fcf553d66bb73f90759319ef17f8 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 22:27:03 +0000 Subject: [PATCH 0224/1116] Change a TerminatorInst* to an Instruction* in HotColdSplitting.cpp. r344558 added an assignment to a TerminatorInst* from BasicBlock::getTerminatorInst(), but BasicBlock::getTerminatorInst() returns an Instruction* rather than a TerminatorInst* since r344504 so this fails to compile. Changing the variable to an Instruction* should get the bots building again. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344566 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/HotColdSplitting.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp index fcea40dffd7..be4da249955 100644 --- a/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/lib/Transforms/IPO/HotColdSplitting.cpp @@ -150,7 +150,7 @@ static bool unlikelyExecuted(const BasicBlock &BB) { } static bool returnsOrHasSideEffects(const BasicBlock &BB) { - const TerminatorInst *I = BB.getTerminator(); + const Instruction *I = BB.getTerminator(); if (isa(I) || isa(I) || isa(I)) return true; -- GitLab From 4812114f295f744ad7b77d4006b8bcdeaa46c1f2 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 22:36:22 +0000 Subject: [PATCH 0225/1116] [ORC] Rename MultiThreadedSimpleCompiler to ConcurrentIRCompiler. The new name is a better fit: This class does not actually spawn any new threads for compilation, it is just safe to call from multiple threads concurrently. The "Simple" part of the name did not convey much either, so it was dropped. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344567 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ExecutionEngine/Orc/CompileUtils.h | 8 ++++---- lib/ExecutionEngine/Orc/LLJIT.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/include/llvm/ExecutionEngine/Orc/CompileUtils.h index 3d02f9d05e4..f34f88311ba 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileUtils.h +++ b/include/llvm/ExecutionEngine/Orc/CompileUtils.h @@ -38,7 +38,7 @@ namespace orc { /// Simple compile functor: Takes a single IR module and returns an ObjectFile. /// This compiler supports a single compilation thread and LLVMContext only. -/// For multithreaded compilation, use MultiThreadedSimpleCompiler below. 
+/// For multithreaded compilation, use ConcurrentIRCompiler below. class SimpleCompiler { public: using CompileResult = std::unique_ptr; @@ -105,10 +105,10 @@ private: /// /// This class creates a new TargetMachine and SimpleCompiler instance for each /// compile. -class MultiThreadedSimpleCompiler { +class ConcurrentIRCompiler { public: - MultiThreadedSimpleCompiler(JITTargetMachineBuilder JTMB, - ObjectCache *ObjCache = nullptr) + ConcurrentIRCompiler(JITTargetMachineBuilder JTMB, + ObjectCache *ObjCache = nullptr) : JTMB(std::move(JTMB)), ObjCache(ObjCache) {} void setObjectCache(ObjectCache *ObjCache) { this->ObjCache = ObjCache; } diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp index 39bb4c48067..478ac2e2148 100644 --- a/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -96,7 +96,7 @@ LLJIT::LLJIT(std::unique_ptr ES, JITTargetMachineBuilder JTMB, ObjLinkingLayer(*this->ES, [this](VModuleKey K) { return getMemoryManager(K); }), CompileLayer(*this->ES, ObjLinkingLayer, - MultiThreadedSimpleCompiler(std::move(JTMB))), + ConcurrentIRCompiler(std::move(JTMB))), CtorRunner(Main), DtorRunner(Main) { assert(NumCompileThreads != 0 && "Multithreaded LLJIT instance can not be created with 0 threads"); -- GitLab From 01e314f12f6e0a8cbc69995cdceb6e0480df189a Mon Sep 17 00:00:00 2001 From: Chris Bieneman Date: Mon, 15 Oct 2018 22:36:59 +0000 Subject: [PATCH 0226/1116] [CMake] Fix a missing LLVM_ENABLE_IDE from r344555 This is just one place I missed swapping CMAKE_CONFIGURATION_TYPES with LLVM_ENABLE_IDE. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344568 91177308-0d34-0410-b5e6-96231b3b80d8 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 374bddbec2d..c189bd875b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -995,7 +995,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) list(REMOVE_DUPLICATES LLVM_LIBS) foreach(lib ${LLVM_LIBS}) add_dependencies(llvm-libraries ${lib}) - if (NOT CMAKE_CONFIGURATION_TYPES) + if (NOT LLVM_ENABLE_IDE) add_dependencies(install-llvm-libraries install-${lib}) endif() endforeach() -- GitLab From 270bd836f891657ed003961b1ae43da950d531cb Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 15 Oct 2018 22:37:46 +0000 Subject: [PATCH 0227/1116] StructurizeCFG,AMDGPU: Test case of a redundant phi and codegen consequences Change-Id: I9681f9e41ca30f82576f3d1f965c3a550a34b171 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344569 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/AMDGPU/smrd.ll | 34 ++++++++++++++ .../StructurizeCFG/loop-continue-phi.ll | 45 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 test/Transforms/StructurizeCFG/loop-continue-phi.ll diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 6596119f8b3..b4220c25f00 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -535,6 +535,40 @@ exit: } +; GCN-LABEL: {{^}}smrd_uniform_loop2: +; (this test differs from smrd_uniform_loop by the more complex structure of phis, +; which currently confuses the DivergenceAnalysis after structurization) +; +; TODO: this should use an s_buffer_load +; +; GCN: buffer_load_dword +define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 { +main_body: + br label %loop + +loop: + %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ] + %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ 
%sum.next.b, %loop.b ] + %offset = shl i32 %counter, 2 + %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset) + %sum.next = fadd float %sum, %v + %counter.next = add i32 %counter, 1 + %cc = icmp uge i32 %counter.next, %bound + br i1 %cc, label %exit, label %loop.a + +loop.a: + %cc.a = icmp uge i32 %counter.next, %bound.a + br i1 %cc, label %loop, label %loop.b + +loop.b: + %sum.next.b = fadd float %sum.next, 1.0 + br label %loop + +exit: + ret float %sum.next +} + + declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2 diff --git a/test/Transforms/StructurizeCFG/loop-continue-phi.ll b/test/Transforms/StructurizeCFG/loop-continue-phi.ll new file mode 100644 index 00000000000..7e1c0b9413f --- /dev/null +++ b/test/Transforms/StructurizeCFG/loop-continue-phi.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s + +; +; TODO: eliminate redundant phis for the loop counter +; +define void @test1() { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: Flow: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[CTR_NEXT:%.*]], [[LOOP_B:%.*]] ], [ [[CTR_NEXT]], [[LOOP_A:%.*]] ] +; CHECK-NEXT: br label [[FLOW1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[FLOW1]] ] +; CHECK-NEXT: [[CTR_NEXT]] = add i32 [[CTR]], 1 +; CHECK-NEXT: br i1 undef, label [[LOOP_A]], label [[FLOW1]] +; CHECK: loop.a: +; CHECK-NEXT: br i1 undef, label [[LOOP_B]], label [[FLOW:%.*]] +; CHECK: loop.b: +; CHECK-NEXT: br label [[FLOW]] +; CHECK: Flow1: +; CHECK-NEXT: [[TMP1]] = phi i32 [ [[TMP0]], [[FLOW]] ], [ undef, [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ] +; CHECK-NEXT: br i1 [[TMP2]], label [[EXIT:%.*]], label 
[[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop.a ], [ %ctr.next, %loop.b ] + %ctr.next = add i32 %ctr, 1 + br i1 undef, label %exit, label %loop.a + +loop.a: + br i1 undef, label %loop, label %loop.b + +loop.b: + br label %loop + +exit: + ret void +} -- GitLab From 582b11962408a54fc88125ba5c75f7470998fe51 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Mon, 15 Oct 2018 22:56:10 +0000 Subject: [PATCH 0228/1116] [ORC] Rename ORC layers to make the "new" ORC layers the default. This commit adds a 'Legacy' prefix to old ORC layers and utilities, and removes the '2' suffix from the new ORC layers. If you wish to continue using the old ORC layers you will need to add a 'Legacy' prefix to your classes. If you were already using the new ORC layers you will need to drop the '2' suffix. The legacy layers will remain in-tree until the new layers reach feature parity with them. This will involve adding support for removing code from the new layers, and ensuring that performance is comparable.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344572 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../BuildingAJIT/Chapter1/KaleidoscopeJIT.h | 6 +- .../BuildingAJIT/Chapter2/KaleidoscopeJIT.h | 8 +- .../BuildingAJIT/Chapter3/KaleidoscopeJIT.h | 10 +- .../BuildingAJIT/Chapter4/KaleidoscopeJIT.h | 8 +- .../BuildingAJIT/Chapter5/KaleidoscopeJIT.h | 8 +- .../Kaleidoscope/include/KaleidoscopeJIT.h | 4 +- .../Orc/CompileOnDemandLayer.h | 24 +- .../llvm/ExecutionEngine/Orc/ExecutionUtils.h | 14 +- .../llvm/ExecutionEngine/Orc/IRCompileLayer.h | 12 +- .../ExecutionEngine/Orc/IRTransformLayer.h | 12 +- include/llvm/ExecutionEngine/Orc/LLJIT.h | 16 +- .../Orc/ObjectTransformLayer.h | 12 +- .../Orc/RTDyldObjectLinkingLayer.h | 26 +- .../Orc/CompileOnDemandLayer.cpp | 30 +- lib/ExecutionEngine/Orc/ExecutionUtils.cpp | 6 +- lib/ExecutionEngine/Orc/IRCompileLayer.cpp | 6 +- lib/ExecutionEngine/Orc/IRTransformLayer.cpp | 4 +- lib/ExecutionEngine/Orc/LLJIT.cpp | 2 +- .../Orc/ObjectTransformLayer.cpp | 10 +- lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 16 +- .../Orc/OrcMCJITReplacement.cpp | 2 +- lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 4 +- .../Orc/RTDyldObjectLinkingLayer.cpp | 8 +- tools/lli/lli.cpp | 6 +- unittests/ExecutionEngine/Orc/CMakeLists.txt | 4 +- ...cpp => LegacyCompileOnDemandLayerTest.cpp} | 4 +- .../LegacyRTDyldObjectLinkingLayerTest.cpp | 282 +++++++++++++++ .../Orc/ObjectTransformLayerTest.cpp | 24 +- .../Orc/RTDyldObjectLinkingLayer2Test.cpp | 228 ------------ .../Orc/RTDyldObjectLinkingLayerTest.cpp | 332 ++++++++---------- 30 files changed, 564 insertions(+), 564 deletions(-) rename unittests/ExecutionEngine/Orc/{CompileOnDemandLayerTest.cpp => LegacyCompileOnDemandLayerTest.cpp} (95%) create mode 100644 unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp delete mode 100644 unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h 
b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h index 0b8bb381d08..8c1af40be15 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h @@ -42,8 +42,8 @@ private: std::shared_ptr Resolver; std::unique_ptr TM; const DataLayout DL; - RTDyldObjectLinkingLayer ObjectLayer; - IRCompileLayer CompileLayer; + LegacyRTDyldObjectLinkingLayer ObjectLayer; + LegacyIRCompileLayer CompileLayer; public: KaleidoscopeJIT() @@ -63,7 +63,7 @@ public: TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), ObjectLayer(ES, [this](VModuleKey) { - return RTDyldObjectLinkingLayer::Resources{ + return LegacyRTDyldObjectLinkingLayer::Resources{ std::make_shared(), Resolver}; }), CompileLayer(ObjectLayer, SimpleCompiler(*TM)) { diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h index 9ea84d1a858..7c803b138c0 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h @@ -47,13 +47,13 @@ private: std::shared_ptr Resolver; std::unique_ptr TM; const DataLayout DL; - RTDyldObjectLinkingLayer ObjectLayer; - IRCompileLayer CompileLayer; + LegacyRTDyldObjectLinkingLayer ObjectLayer; + LegacyIRCompileLayer CompileLayer; using OptimizeFunction = std::function(std::unique_ptr)>; - IRTransformLayer OptimizeLayer; + LegacyIRTransformLayer OptimizeLayer; public: KaleidoscopeJIT() @@ -73,7 +73,7 @@ public: TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), ObjectLayer(ES, [this](VModuleKey) { - return RTDyldObjectLinkingLayer::Resources{ + return LegacyRTDyldObjectLinkingLayer::Resources{ std::make_shared(), Resolver}; }), CompileLayer(ObjectLayer, SimpleCompiler(*TM)), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h index 
80c39bd70f7..ce0111d2f6b 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h @@ -51,23 +51,23 @@ private: std::map> Resolvers; std::unique_ptr TM; const DataLayout DL; - RTDyldObjectLinkingLayer ObjectLayer; - IRCompileLayer CompileLayer; + LegacyRTDyldObjectLinkingLayer ObjectLayer; + LegacyIRCompileLayer CompileLayer; using OptimizeFunction = std::function(std::unique_ptr)>; - IRTransformLayer OptimizeLayer; + LegacyIRTransformLayer OptimizeLayer; std::unique_ptr CompileCallbackManager; - CompileOnDemandLayer CODLayer; + LegacyCompileOnDemandLayer CODLayer; public: KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), ObjectLayer(ES, [this](VModuleKey K) { - return RTDyldObjectLinkingLayer::Resources{ + return LegacyRTDyldObjectLinkingLayer::Resources{ std::make_shared(), Resolvers[K]}; }), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h index 04ad86e34bf..ffca65fbcd4 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h @@ -77,13 +77,13 @@ private: std::shared_ptr Resolver; std::unique_ptr TM; const DataLayout DL; - RTDyldObjectLinkingLayer ObjectLayer; - IRCompileLayer CompileLayer; + LegacyRTDyldObjectLinkingLayer ObjectLayer; + LegacyIRCompileLayer CompileLayer; using OptimizeFunction = std::function(std::unique_ptr)>; - IRTransformLayer OptimizeLayer; + LegacyIRTransformLayer OptimizeLayer; std::unique_ptr CompileCallbackMgr; std::unique_ptr IndirectStubsMgr; @@ -108,7 +108,7 @@ public: TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), ObjectLayer(ES, [this](VModuleKey K) { - return RTDyldObjectLinkingLayer::Resources{ + return LegacyRTDyldObjectLinkingLayer::Resources{ std::make_shared(), Resolver}; }), CompileLayer(ObjectLayer, 
SimpleCompiler(*TM)), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h index 010f5436377..f1ae5b02289 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h @@ -82,13 +82,13 @@ private: std::shared_ptr Resolver; std::unique_ptr TM; const DataLayout DL; - RTDyldObjectLinkingLayer ObjectLayer; - IRCompileLayer CompileLayer; + LegacyRTDyldObjectLinkingLayer ObjectLayer; + LegacyIRCompileLayer CompileLayer; using OptimizeFunction = std::function(std::unique_ptr)>; - IRTransformLayer OptimizeLayer; + LegacyIRTransformLayer OptimizeLayer; JITCompileCallbackManager *CompileCallbackMgr; std::unique_ptr IndirectStubsMgr; @@ -116,7 +116,7 @@ public: DL(TM->createDataLayout()), ObjectLayer(ES, [this](VModuleKey K) { - return RTDyldObjectLinkingLayer::Resources{ + return LegacyRTDyldObjectLinkingLayer::Resources{ cantFail(this->Remote.createRemoteMemoryManager()), Resolver}; }), diff --git a/examples/Kaleidoscope/include/KaleidoscopeJIT.h b/examples/Kaleidoscope/include/KaleidoscopeJIT.h index 7239aea7ba1..972773a64f7 100644 --- a/examples/Kaleidoscope/include/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/include/KaleidoscopeJIT.h @@ -40,8 +40,8 @@ namespace orc { class KaleidoscopeJIT { public: - using ObjLayerT = RTDyldObjectLinkingLayer; - using CompileLayerT = IRCompileLayer; + using ObjLayerT = LegacyRTDyldObjectLinkingLayer; + using CompileLayerT = LegacyIRCompileLayer; KaleidoscopeJIT() : Resolver(createLegacyLookupResolver( diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 2003f8e43b8..7721f74fe0c 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -62,7 +62,7 @@ namespace orc { class ExtractingIRMaterializationUnit; -class 
CompileOnDemandLayer2 : public IRLayer { +class CompileOnDemandLayer : public IRLayer { friend class PartitioningIRMaterializationUnit; public: @@ -84,8 +84,8 @@ public: /// symbol in them is requested. static Optional compileWholeModule(GlobalValueSet Requested); - /// Construct a CompileOnDemandLayer2. - CompileOnDemandLayer2(ExecutionSession &ES, IRLayer &BaseLayer, + /// Construct a CompileOnDemandLayer. + CompileOnDemandLayer(ExecutionSession &ES, IRLayer &BaseLayer, LazyCallThroughManager &LCTMgr, IndirectStubsManagerBuilder BuildIndirectStubsManager); @@ -142,7 +142,7 @@ private: template -class CompileOnDemandLayer { +class LegacyCompileOnDemandLayer { private: template class LambdaMaterializer final : public ValueMaterializer { @@ -266,13 +266,13 @@ public: std::function R)>; /// Construct a compile-on-demand layer instance. - CompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer, - SymbolResolverGetter GetSymbolResolver, - SymbolResolverSetter SetSymbolResolver, - PartitioningFtor Partition, - CompileCallbackMgrT &CallbackMgr, - IndirectStubsManagerBuilderT CreateIndirectStubsManager, - bool CloneStubsIntoPartitions = true) + LegacyCompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer, + SymbolResolverGetter GetSymbolResolver, + SymbolResolverSetter SetSymbolResolver, + PartitioningFtor Partition, + CompileCallbackMgrT &CallbackMgr, + IndirectStubsManagerBuilderT CreateIndirectStubsManager, + bool CloneStubsIntoPartitions = true) : ES(ES), BaseLayer(BaseLayer), GetSymbolResolver(std::move(GetSymbolResolver)), SetSymbolResolver(std::move(SetSymbolResolver)), @@ -280,7 +280,7 @@ public: CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)), CloneStubsIntoPartitions(CloneStubsIntoPartitions) {} - ~CompileOnDemandLayer() { + ~LegacyCompileOnDemandLayer() { // FIXME: Report error on log. 
while (!LogicalDylibs.empty()) consumeError(removeModule(LogicalDylibs.begin()->first)); diff --git a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index 662ed7b78e4..88559f822e5 100644 --- a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -94,11 +94,11 @@ iterator_range getDestructors(const Module &M); /// Convenience class for recording constructor/destructor names for /// later execution. template -class CtorDtorRunner { +class LegacyCtorDtorRunner { public: /// Construct a CtorDtorRunner for the given range using the given /// name mangling function. - CtorDtorRunner(std::vector CtorDtorNames, VModuleKey K) + LegacyCtorDtorRunner(std::vector CtorDtorNames, VModuleKey K) : CtorDtorNames(std::move(CtorDtorNames)), K(K) {} /// Run the recorded constructors/destructors through the given JIT @@ -129,9 +129,9 @@ private: orc::VModuleKey K; }; -class CtorDtorRunner2 { +class CtorDtorRunner { public: - CtorDtorRunner2(JITDylib &JD) : JD(JD) {} + CtorDtorRunner(JITDylib &JD) : JD(JD) {} void add(iterator_range CtorDtors); Error run(); @@ -177,11 +177,11 @@ protected: void *DSOHandle); }; -class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase { +class LegacyLocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase { public: /// Create a runtime-overrides class. 
template - LocalCXXRuntimeOverrides(const MangleFtorT &Mangle) { + LegacyLocalCXXRuntimeOverrides(const MangleFtorT &Mangle) { addOverride(Mangle("__dso_handle"), toTargetAddress(&DSOHandleOverride)); addOverride(Mangle("__cxa_atexit"), toTargetAddress(&CXAAtExitOverride)); } @@ -202,7 +202,7 @@ private: StringMap CXXRuntimeOverrides; }; -class LocalCXXRuntimeOverrides2 : public LocalCXXRuntimeOverridesBase { +class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase { public: Error enable(JITDylib &JD, MangleAndInterner &Mangler); }; diff --git a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h index cb8df26bfdc..a62d8be2fa6 100644 --- a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h +++ b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h @@ -28,7 +28,7 @@ class Module; namespace orc { -class IRCompileLayer2 : public IRLayer { +class IRCompileLayer : public IRLayer { public: using CompileFunction = std::function>(Module &)>; @@ -36,8 +36,8 @@ public: using NotifyCompiledFunction = std::function; - IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer, - CompileFunction Compile); + IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, + CompileFunction Compile); void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled); @@ -57,15 +57,15 @@ private: /// object file and adds this module file to the layer below, which must /// implement the object layer concept. template -class IRCompileLayer { +class LegacyIRCompileLayer { public: /// Callback type for notifications when modules are compiled. using NotifyCompiledCallback = std::function)>; - /// Construct an IRCompileLayer with the given BaseLayer, which must + /// Construct an LegacyIRCompileLayer with the given BaseLayer, which must /// implement the ObjectLayer concept. 
- IRCompileLayer( + LegacyIRCompileLayer( BaseLayerT &BaseLayer, CompileFtor Compile, NotifyCompiledCallback NotifyCompiled = NotifyCompiledCallback()) : BaseLayer(BaseLayer), Compile(std::move(Compile)), diff --git a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h index d5f91cef359..55a1ce4c930 100644 --- a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h +++ b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h @@ -23,13 +23,13 @@ namespace llvm { class Module; namespace orc { -class IRTransformLayer2 : public IRLayer { +class IRTransformLayer : public IRLayer { public: using TransformFunction = std::function( ThreadSafeModule, const MaterializationResponsibility &R)>; - IRTransformLayer2(ExecutionSession &ES, IRLayer &BaseLayer, - TransformFunction Transform = identityTransform); + IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer, + TransformFunction Transform = identityTransform); void setTransform(TransformFunction Transform) { this->Transform = std::move(Transform); @@ -54,11 +54,11 @@ private: /// This layer applies a user supplied transform to each module that is added, /// then adds the transformed module to the layer below. 
template -class IRTransformLayer { +class LegacyIRTransformLayer { public: - /// Construct an IRTransformLayer with the given BaseLayer - IRTransformLayer(BaseLayerT &BaseLayer, + /// Construct an LegacyIRTransformLayer with the given BaseLayer + LegacyIRTransformLayer(BaseLayerT &BaseLayer, TransformFtor Transform = TransformFtor()) : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h index 400d4cbe7f0..05a566fedb6 100644 --- a/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -99,7 +99,7 @@ public: Error runDestructors() { return DtorRunner.run(); } /// Returns a reference to the ObjLinkingLayer - RTDyldObjectLinkingLayer2 &getObjLinkingLayer() { return ObjLinkingLayer; } + RTDyldObjectLinkingLayer &getObjLinkingLayer() { return ObjLinkingLayer; } protected: @@ -125,10 +125,10 @@ protected: DataLayout DL; std::unique_ptr CompileThreads; - RTDyldObjectLinkingLayer2 ObjLinkingLayer; - IRCompileLayer2 CompileLayer; + RTDyldObjectLinkingLayer ObjLinkingLayer; + IRCompileLayer CompileLayer; - CtorDtorRunner2 CtorRunner, DtorRunner; + CtorDtorRunner CtorRunner, DtorRunner; }; /// An extended version of LLJIT that supports lazy function-at-a-time @@ -145,13 +145,13 @@ public: /// Set an IR transform (e.g. pass manager pipeline) to run on each function /// when it is compiled. - void setLazyCompileTransform(IRTransformLayer2::TransformFunction Transform) { + void setLazyCompileTransform(IRTransformLayer::TransformFunction Transform) { TransformLayer.setTransform(std::move(Transform)); } /// Sets the partition function. 
void - setPartitionFunction(CompileOnDemandLayer2::PartitionFunction Partition) { + setPartitionFunction(CompileOnDemandLayer::PartitionFunction Partition) { CODLayer.setPartitionFunction(std::move(Partition)); } @@ -180,8 +180,8 @@ private: std::unique_ptr LCTMgr; std::function()> ISMBuilder; - IRTransformLayer2 TransformLayer; - CompileOnDemandLayer2 CODLayer; + IRTransformLayer TransformLayer; + CompileOnDemandLayer CODLayer; }; } // End namespace orc diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h index c6b43a9c8ed..6cd688ad58a 100644 --- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h +++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h @@ -23,14 +23,14 @@ namespace llvm { namespace orc { -class ObjectTransformLayer2 : public ObjectLayer { +class ObjectTransformLayer : public ObjectLayer { public: using TransformFunction = std::function>( std::unique_ptr)>; - ObjectTransformLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer, - TransformFunction Transform); + ObjectTransformLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, + TransformFunction Transform); void emit(MaterializationResponsibility R, VModuleKey K, std::unique_ptr O) override; @@ -46,11 +46,11 @@ private: /// immediately applies the user supplied functor to each object, then adds /// the set of transformed objects to the layer below. 
template -class ObjectTransformLayer { +class LegacyObjectTransformLayer { public: /// Construct an ObjectTransformLayer with the given BaseLayer - ObjectTransformLayer(BaseLayerT &BaseLayer, - TransformFtor Transform = TransformFtor()) + LegacyObjectTransformLayer(BaseLayerT &BaseLayer, + TransformFtor Transform = TransformFtor()) : BaseLayer(BaseLayer), Transform(std::move(Transform)) {} /// Apply the transform functor to each object in the object set, then diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index 0c30520a21b..bbd782fdece 100644 --- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -36,7 +36,7 @@ namespace llvm { namespace orc { -class RTDyldObjectLinkingLayer2 : public ObjectLayer { +class RTDyldObjectLinkingLayer : public ObjectLayer { public: /// Functor for receiving object-loaded notifications. using NotifyLoadedFunction = @@ -51,7 +51,7 @@ public: /// Construct an ObjectLinkingLayer with the given NotifyLoaded, /// and NotifyEmitted functors. - RTDyldObjectLinkingLayer2( + RTDyldObjectLinkingLayer( ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager, NotifyLoadedFunction NotifyLoaded = NotifyLoadedFunction(), NotifyEmittedFunction NotifyEmitted = NotifyEmittedFunction()); @@ -66,7 +66,7 @@ public: /// the memory manager, rather than just the sections required for execution. /// /// This is kludgy, and may be removed in the future. - RTDyldObjectLinkingLayer2 &setProcessAllSections(bool ProcessAllSections) { + RTDyldObjectLinkingLayer &setProcessAllSections(bool ProcessAllSections) { this->ProcessAllSections = ProcessAllSections; return *this; } @@ -79,13 +79,13 @@ public: /// /// FIXME: We should be able to remove this if/when COFF properly tracks /// exported symbols. 
- RTDyldObjectLinkingLayer2 & + RTDyldObjectLinkingLayer & setOverrideObjectFlagsWithResponsibilityFlags(bool OverrideObjectFlags) { this->OverrideObjectFlags = OverrideObjectFlags; return *this; } - /// If set, this RTDyldObjectLinkingLayer2 instance will claim responsibility + /// If set, this RTDyldObjectLinkingLayer instance will claim responsibility /// for any symbols provided by a given object file that were not already in /// the MaterializationResponsibility instance. Setting this flag allows /// higher-level program representations (e.g. LLVM IR) to be added based on @@ -96,7 +96,7 @@ public: /// deterministically). If this option is set, clashes for the additional /// symbols may not be detected until late, and detection may depend on /// the flow of control through JIT'd code. Use with care. - RTDyldObjectLinkingLayer2 & + RTDyldObjectLinkingLayer & setAutoClaimResponsibilityForObjectSymbols(bool AutoClaimObjectSymbols) { this->AutoClaimObjectSymbols = AutoClaimObjectSymbols; return *this; @@ -121,7 +121,7 @@ private: std::map> MemMgrs; }; -class RTDyldObjectLinkingLayerBase { +class LegacyRTDyldObjectLinkingLayerBase { public: using ObjectPtr = std::unique_ptr; @@ -173,10 +173,10 @@ protected: /// object files to be loaded into memory, linked, and the addresses of their /// symbols queried. All objects added to this layer can see each other's /// symbols. -class RTDyldObjectLinkingLayer : public RTDyldObjectLinkingLayerBase { +class LegacyRTDyldObjectLinkingLayer : public LegacyRTDyldObjectLinkingLayerBase { public: - using RTDyldObjectLinkingLayerBase::ObjectPtr; + using LegacyRTDyldObjectLinkingLayerBase::ObjectPtr; /// Functor for receiving object-loaded notifications. 
using NotifyLoadedFtor = @@ -197,7 +197,7 @@ private: template class ConcreteLinkedObject : public LinkedObject { public: - ConcreteLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K, + ConcreteLinkedObject(LegacyRTDyldObjectLinkingLayer &Parent, VModuleKey K, OwnedObject Obj, MemoryManagerPtrT MemMgr, std::shared_ptr Resolver, bool ProcessAllSections) @@ -313,7 +313,7 @@ private: }; VModuleKey K; - RTDyldObjectLinkingLayer &Parent; + LegacyRTDyldObjectLinkingLayer &Parent; MemoryManagerPtrT MemMgr; OwnedObject ObjForNotify; std::unique_ptr PFC; @@ -321,7 +321,7 @@ private: template std::unique_ptr> - createLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K, + createLinkedObject(LegacyRTDyldObjectLinkingLayer &Parent, VModuleKey K, OwnedObject Obj, MemoryManagerPtrT MemMgr, std::shared_ptr Resolver, bool ProcessAllSections) { @@ -341,7 +341,7 @@ public: /// Construct an ObjectLinkingLayer with the given NotifyLoaded, /// and NotifyFinalized functors. - RTDyldObjectLinkingLayer( + LegacyRTDyldObjectLinkingLayer( ExecutionSession &ES, ResourcesGetter GetResources, NotifyLoadedFtor NotifyLoaded = NotifyLoadedFtor(), NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor(), diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index ae1c7e84259..f27a814f33f 100644 --- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -68,13 +68,13 @@ namespace orc { class PartitioningIRMaterializationUnit : public IRMaterializationUnit { public: PartitioningIRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM, - CompileOnDemandLayer2 &Parent) + CompileOnDemandLayer &Parent) : IRMaterializationUnit(ES, std::move(TSM)), Parent(Parent) {} PartitioningIRMaterializationUnit( ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags, SymbolNameToDefinitionMap SymbolToDefinition, - CompileOnDemandLayer2 &Parent) + CompileOnDemandLayer &Parent) : 
IRMaterializationUnit(std::move(TSM), std::move(SymbolFlags), std::move(SymbolToDefinition)), Parent(Parent) {} @@ -93,30 +93,30 @@ private: } mutable std::mutex SourceModuleMutex; - CompileOnDemandLayer2 &Parent; + CompileOnDemandLayer &Parent; }; -Optional -CompileOnDemandLayer2::compileRequested(GlobalValueSet Requested) { +Optional +CompileOnDemandLayer::compileRequested(GlobalValueSet Requested) { return std::move(Requested); } -Optional -CompileOnDemandLayer2::compileWholeModule(GlobalValueSet Requested) { +Optional +CompileOnDemandLayer::compileWholeModule(GlobalValueSet Requested) { return None; } -CompileOnDemandLayer2::CompileOnDemandLayer2( +CompileOnDemandLayer::CompileOnDemandLayer( ExecutionSession &ES, IRLayer &BaseLayer, LazyCallThroughManager &LCTMgr, IndirectStubsManagerBuilder BuildIndirectStubsManager) : IRLayer(ES), BaseLayer(BaseLayer), LCTMgr(LCTMgr), BuildIndirectStubsManager(std::move(BuildIndirectStubsManager)) {} -void CompileOnDemandLayer2::setPartitionFunction(PartitionFunction Partition) { +void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) { this->Partition = std::move(Partition); } -void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K, +void CompileOnDemandLayer::emit(MaterializationResponsibility R, VModuleKey K, ThreadSafeModule TSM) { assert(TSM.getModule() && "Null module"); @@ -160,8 +160,8 @@ void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K, std::move(Callables))); } -CompileOnDemandLayer2::PerDylibResources & -CompileOnDemandLayer2::getPerDylibResources(JITDylib &TargetD) { +CompileOnDemandLayer::PerDylibResources & +CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) { auto I = DylibResources.find(&TargetD); if (I == DylibResources.end()) { auto &ImplD = @@ -176,7 +176,7 @@ CompileOnDemandLayer2::getPerDylibResources(JITDylib &TargetD) { return I->second; } -void CompileOnDemandLayer2::cleanUpModule(Module &M) { +void 
CompileOnDemandLayer::cleanUpModule(Module &M) { for (auto &F : M.functions()) { if (F.isDeclaration()) continue; @@ -189,7 +189,7 @@ void CompileOnDemandLayer2::cleanUpModule(Module &M) { } } -void CompileOnDemandLayer2::expandPartition(GlobalValueSet &Partition) { +void CompileOnDemandLayer::expandPartition(GlobalValueSet &Partition) { // Expands the partition to ensure the following rules hold: // (1) If any alias is in the partition, its aliasee is also in the partition. // (2) If any aliasee is in the partition, its aliases are also in the @@ -221,7 +221,7 @@ void CompileOnDemandLayer2::expandPartition(GlobalValueSet &Partition) { Partition.insert(GV); } -void CompileOnDemandLayer2::emitPartition( +void CompileOnDemandLayer::emitPartition( MaterializationResponsibility R, ThreadSafeModule TSM, IRMaterializationUnit::SymbolNameToDefinitionMap Defs) { diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 4c8f725df54..21a604f71ca 100644 --- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -87,7 +87,7 @@ iterator_range getDestructors(const Module &M) { CtorDtorIterator(DtorsList, true)); } -void CtorDtorRunner2::add(iterator_range CtorDtors) { +void CtorDtorRunner::add(iterator_range CtorDtors) { if (CtorDtors.begin() == CtorDtors.end()) return; @@ -115,7 +115,7 @@ void CtorDtorRunner2::add(iterator_range CtorDtors) { } } -Error CtorDtorRunner2::run() { +Error CtorDtorRunner::run() { using CtorDtorTy = void (*)(); SymbolNameSet Names; @@ -165,7 +165,7 @@ int LocalCXXRuntimeOverridesBase::CXAAtExitOverride(DestructorPtr Destructor, return 0; } -Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD, +Error LocalCXXRuntimeOverrides::enable(JITDylib &JD, MangleAndInterner &Mangle) { SymbolMap RuntimeInterposes; RuntimeInterposes[Mangle("__dso_handle")] = diff --git a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp index 
5dee1c80e0b..6d029e16ba9 100644 --- a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp +++ b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp @@ -12,16 +12,16 @@ namespace llvm { namespace orc { -IRCompileLayer2::IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer, +IRCompileLayer::IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer, CompileFunction Compile) : IRLayer(ES), BaseLayer(BaseLayer), Compile(std::move(Compile)) {} -void IRCompileLayer2::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) { +void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) { std::lock_guard Lock(IRLayerMutex); this->NotifyCompiled = std::move(NotifyCompiled); } -void IRCompileLayer2::emit(MaterializationResponsibility R, VModuleKey K, +void IRCompileLayer::emit(MaterializationResponsibility R, VModuleKey K, ThreadSafeModule TSM) { assert(TSM.getModule() && "Module must not be null"); diff --git a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp index 7a79a382d8d..acba7916d40 100644 --- a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp +++ b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp @@ -13,12 +13,12 @@ namespace llvm { namespace orc { -IRTransformLayer2::IRTransformLayer2(ExecutionSession &ES, +IRTransformLayer::IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer, TransformFunction Transform) : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {} -void IRTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K, +void IRTransformLayer::emit(MaterializationResponsibility R, VModuleKey K, ThreadSafeModule TSM) { assert(TSM.getModule() && "Module must not be null"); diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp index 478ac2e2148..e464da267ae 100644 --- a/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -21,7 +21,7 @@ namespace { : llvm::orc::SimpleCompiler(*TM), TM(std::move(TM)) {} private: // FIXME: shared because 
std::functions (and thus - // IRCompileLayer2::CompileFunction) are not moveable. + // IRCompileLayer::CompileFunction) are not moveable. std::shared_ptr TM; }; diff --git a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp index 6980c8140fd..0be23f2e1a4 100644 --- a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp +++ b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp @@ -13,13 +13,13 @@ namespace llvm { namespace orc { -ObjectTransformLayer2::ObjectTransformLayer2(ExecutionSession &ES, - ObjectLayer &BaseLayer, - TransformFunction Transform) +ObjectTransformLayer::ObjectTransformLayer(ExecutionSession &ES, + ObjectLayer &BaseLayer, + TransformFunction Transform) : ObjectLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {} -void ObjectTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K, - std::unique_ptr O) { +void ObjectTransformLayer::emit(MaterializationResponsibility R, VModuleKey K, + std::unique_ptr O) { assert(O && "Module must not be null"); if (auto TransformedObj = Transform(std::move(O))) diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h index 3fedba1caa6..deddfcb10e1 100644 --- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h +++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h @@ -77,9 +77,9 @@ public: }; template <> - class GenericLayerImpl : public GenericLayer { + class GenericLayerImpl : public GenericLayer { private: - using LayerT = orc::RTDyldObjectLinkingLayer; + using LayerT = orc::LegacyRTDyldObjectLinkingLayer; public: GenericLayerImpl(LayerT &Layer) : Layer(Layer) {} @@ -107,10 +107,10 @@ class OrcCBindingsStack { public: using CompileCallbackMgr = orc::JITCompileCallbackManager; - using ObjLayerT = orc::RTDyldObjectLinkingLayer; - using CompileLayerT = orc::IRCompileLayer; + using ObjLayerT = orc::LegacyRTDyldObjectLinkingLayer; + using CompileLayerT = orc::LegacyIRCompileLayer; using CODLayerT = - 
orc::CompileOnDemandLayer; + orc::LegacyCompileOnDemandLayer; using CallbackManagerBuilder = std::function()>; @@ -312,7 +312,7 @@ public: // Run the static constructors, and save the static destructor runner for // execution when the JIT is torn down. - orc::CtorDtorRunner CtorRunner(std::move(CtorNames), K); + orc::LegacyCtorDtorRunner CtorRunner(std::move(CtorNames), K); if (auto Err = CtorRunner.runViaLayer(*this)) return std::move(Err); @@ -517,8 +517,8 @@ private: std::map> KeyLayers; - orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides; - std::vector> IRStaticDestructorRunners; + orc::LegacyLocalCXXRuntimeOverrides CXXRuntimeOverrides; + std::vector> IRStaticDestructorRunners; std::string ErrMsg; ResolverMap Resolvers; diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp index 4def579e709..617bc2fc64b 100644 --- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp +++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp @@ -128,7 +128,7 @@ void OrcMCJITReplacement::runStaticConstructorsDestructors(bool isDtors) { auto &CtorDtorsMap = isDtors ? 
UnexecutedDestructors : UnexecutedConstructors; for (auto &KV : CtorDtorsMap) - cantFail(CtorDtorRunner(std::move(KV.second), KV.first) + cantFail(LegacyCtorDtorRunner(std::move(KV.second), KV.first) .runViaLayer(LazyEmitLayer)); CtorDtorsMap.clear(); diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index 1195d39561d..36e7e83a8ba 100644 --- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -461,8 +461,8 @@ private: return MangledName; } - using ObjectLayerT = RTDyldObjectLinkingLayer; - using CompileLayerT = IRCompileLayer; + using ObjectLayerT = LegacyRTDyldObjectLinkingLayer; + using CompileLayerT = LegacyIRCompileLayer; using LazyEmitLayerT = LazyEmittingLayer; ExecutionSession ES; diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp index e84295ca215..fa574140d48 100644 --- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp +++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp @@ -76,14 +76,14 @@ private: namespace llvm { namespace orc { -RTDyldObjectLinkingLayer2::RTDyldObjectLinkingLayer2( +RTDyldObjectLinkingLayer::RTDyldObjectLinkingLayer( ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager, NotifyLoadedFunction NotifyLoaded, NotifyEmittedFunction NotifyEmitted) : ObjectLayer(ES), GetMemoryManager(GetMemoryManager), NotifyLoaded(std::move(NotifyLoaded)), NotifyEmitted(std::move(NotifyEmitted)) {} -void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R, +void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, VModuleKey K, std::unique_ptr O) { assert(O && "Object must not be null"); @@ -153,7 +153,7 @@ void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R, }); } -Error RTDyldObjectLinkingLayer2::onObjLoad( +Error RTDyldObjectLinkingLayer::onObjLoad( VModuleKey K, MaterializationResponsibility &R, object::ObjectFile &Obj, 
std::unique_ptr LoadedObjInfo, std::map Resolved, @@ -196,7 +196,7 @@ Error RTDyldObjectLinkingLayer2::onObjLoad( return Error::success(); } -void RTDyldObjectLinkingLayer2::onObjEmit(VModuleKey K, +void RTDyldObjectLinkingLayer::onObjEmit(VModuleKey K, MaterializationResponsibility &R, Error Err) { if (Err) { diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp index d633fe6f800..f4585dc080d 100644 --- a/tools/lli/lli.cpp +++ b/tools/lli/lli.cpp @@ -696,7 +696,7 @@ int main(int argc, char **argv, char * const *envp) { return Result; } -static orc::IRTransformLayer2::TransformFunction createDebugDumper() { +static orc::IRTransformLayer::TransformFunction createDebugDumper() { switch (OrcDumpKind) { case DumpKind::NoDump: return [](orc::ThreadSafeModule TSM, @@ -781,7 +781,7 @@ int runOrcLazyJIT(const char *ProgName) { auto J = ExitOnErr(orc::LLLazyJIT::Create(std::move(JTMB), DL, LazyJITCompileThreads)); if (PerModuleLazy) - J->setPartitionFunction(orc::CompileOnDemandLayer2::compileWholeModule); + J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule); auto Dump = createDebugDumper(); @@ -797,7 +797,7 @@ int runOrcLazyJIT(const char *ProgName) { ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(DL))); orc::MangleAndInterner Mangle(J->getExecutionSession(), DL); - orc::LocalCXXRuntimeOverrides2 CXXRuntimeOverrides; + orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides; ExitOnErr(CXXRuntimeOverrides.enable(J->getMainJITDylib(), Mangle)); // Add the main module. 
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt index 8b0d5fc2435..019437d4ad5 100644 --- a/unittests/ExecutionEngine/Orc/CMakeLists.txt +++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt @@ -10,7 +10,6 @@ set(LLVM_LINK_COMPONENTS ) add_llvm_unittest(OrcJITTests - CompileOnDemandLayerTest.cpp CoreAPIsTest.cpp IndirectionUtilsTest.cpp GlobalMappingLayerTest.cpp @@ -18,6 +17,8 @@ add_llvm_unittest(OrcJITTests LazyCallThroughAndReexportsTest.cpp LazyEmittingLayerTest.cpp LegacyAPIInteropTest.cpp + LegacyCompileOnDemandLayerTest.cpp + LegacyRTDyldObjectLinkingLayerTest.cpp ObjectTransformLayerTest.cpp OrcCAPITest.cpp OrcTestCommon.cpp @@ -25,7 +26,6 @@ add_llvm_unittest(OrcJITTests RemoteObjectLayerTest.cpp RPCUtilsTest.cpp RTDyldObjectLinkingLayerTest.cpp - RTDyldObjectLinkingLayer2Test.cpp SymbolStringPoolTest.cpp ThreadSafeModuleTest.cpp ) diff --git a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp b/unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp similarity index 95% rename from unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp rename to unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp index 9aa4437550b..38f7a654571 100644 --- a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp @@ -54,7 +54,7 @@ public: } }; -TEST(CompileOnDemandLayerTest, FindSymbol) { +TEST(LegacyCompileOnDemandLayerTest, FindSymbol) { MockBaseLayer> TestBaseLayer; TestBaseLayer.findSymbolImpl = [](const std::string &Name, bool) { @@ -76,7 +76,7 @@ TEST(CompileOnDemandLayerTest, FindSymbol) { llvm_unreachable("Should never be called"); }; - llvm::orc::CompileOnDemandLayer COD( + llvm::orc::LegacyCompileOnDemandLayer COD( ES, TestBaseLayer, GetResolver, SetResolver, [](Function &F) { return std::set{&F}; }, CallbackMgr, [] { return llvm::make_unique(); }, true); diff --git 
a/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp new file mode 100644 index 00000000000..8c9c958cc42 --- /dev/null +++ b/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp @@ -0,0 +1,282 @@ +//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "OrcTestCommon.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" +#include "llvm/ExecutionEngine/Orc/Legacy.h" +#include "llvm/ExecutionEngine/Orc/NullResolver.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/LLVMContext.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::orc; + +namespace { + +class LegacyRTDyldObjectLinkingLayerExecutionTest : public testing::Test, + public OrcExecutionTest { + +}; + +class SectionMemoryManagerWrapper : public SectionMemoryManager { +public: + int FinalizationCount = 0; + int NeedsToReserveAllocationSpaceCount = 0; + + bool needsToReserveAllocationSpace() override { + ++NeedsToReserveAllocationSpaceCount; + return SectionMemoryManager::needsToReserveAllocationSpace(); + } + + bool finalizeMemory(std::string *ErrMsg = nullptr) override { + ++FinalizationCount; + return SectionMemoryManager::finalizeMemory(ErrMsg); + } +}; + +TEST(LegacyRTDyldObjectLinkingLayerTest, TestSetProcessAllSections) { + class MemoryManagerWrapper : public SectionMemoryManager { + public: + MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {} + uint8_t 
*allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID, + StringRef SectionName, + bool IsReadOnly) override { + if (SectionName == ".debug_str") + DebugSeen = true; + return SectionMemoryManager::allocateDataSection(Size, Alignment, + SectionID, + SectionName, + IsReadOnly); + } + private: + bool &DebugSeen; + }; + + bool DebugSectionSeen = false; + auto MM = std::make_shared(DebugSectionSeen); + + ExecutionSession ES; + + LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey) { + return LegacyRTDyldObjectLinkingLayer::Resources{ + MM, std::make_shared()}; + }); + + LLVMContext Context; + auto M = llvm::make_unique("", Context); + M->setTargetTriple("x86_64-unknown-linux-gnu"); + Type *Int32Ty = IntegerType::get(Context, 32); + GlobalVariable *GV = + new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, + ConstantInt::get(Int32Ty, 42), "foo"); + + GV->setSection(".debug_str"); + + + // Initialize the native target in case this is the first unit test + // to try to build a TM. + OrcNativeTarget::initialize(); + std::unique_ptr TM( + EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "", + SmallVector())); + if (!TM) + return; + + auto Obj = SimpleCompiler(*TM)(*M); + + { + // Test with ProcessAllSections = false (the default). + auto K = ES.allocateVModule(); + cantFail(ObjLayer.addObject( + K, MemoryBuffer::getMemBufferCopy(Obj->getBuffer()))); + cantFail(ObjLayer.emitAndFinalize(K)); + EXPECT_EQ(DebugSectionSeen, false) + << "Unexpected debug info section"; + cantFail(ObjLayer.removeObject(K)); + } + + { + // Test with ProcessAllSections = true. 
+ ObjLayer.setProcessAllSections(true); + auto K = ES.allocateVModule(); + cantFail(ObjLayer.addObject(K, std::move(Obj))); + cantFail(ObjLayer.emitAndFinalize(K)); + EXPECT_EQ(DebugSectionSeen, true) + << "Expected debug info section not seen"; + cantFail(ObjLayer.removeObject(K)); + } +} + +TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) { + if (!SupportsJIT) + return; + + ExecutionSession ES; + + auto MM = std::make_shared(); + + std::map> Resolvers; + + LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&](VModuleKey K) { + auto I = Resolvers.find(K); + assert(I != Resolvers.end() && "Missing resolver"); + auto R = std::move(I->second); + Resolvers.erase(I); + return LegacyRTDyldObjectLinkingLayer::Resources{MM, std::move(R)}; + }); + SimpleCompiler Compile(*TM); + + // Create a pair of modules that will trigger recursive finalization: + // Module 1: + // int bar() { return 42; } + // Module 2: + // int bar(); + // int foo() { return bar(); } + // + // Verify that the memory manager is only finalized once (for Module 2). + // Failure suggests that finalize is being called on the inner RTDyld + // instance (for Module 1) which is unsafe, as it will prevent relocation of + // Module 2. 
+ + ModuleBuilder MB1(Context, "", "dummy"); + { + MB1.getModule()->setDataLayout(TM->createDataLayout()); + Function *BarImpl = MB1.createFunctionDecl("bar"); + BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl); + IRBuilder<> Builder(BarEntry); + IntegerType *Int32Ty = IntegerType::get(Context, 32); + Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42); + Builder.CreateRet(FourtyTwo); + } + + auto Obj1 = Compile(*MB1.getModule()); + + ModuleBuilder MB2(Context, "", "dummy"); + { + MB2.getModule()->setDataLayout(TM->createDataLayout()); + Function *BarDecl = MB2.createFunctionDecl("bar"); + Function *FooImpl = MB2.createFunctionDecl("foo"); + BasicBlock *FooEntry = BasicBlock::Create(Context, "entry", FooImpl); + IRBuilder<> Builder(FooEntry); + Builder.CreateRet(Builder.CreateCall(BarDecl)); + } + auto Obj2 = Compile(*MB2.getModule()); + + auto K1 = ES.allocateVModule(); + Resolvers[K1] = std::make_shared(); + cantFail(ObjLayer.addObject(K1, std::move(Obj1))); + + auto K2 = ES.allocateVModule(); + auto LegacyLookup = [&](const std::string &Name) { + return ObjLayer.findSymbol(Name, true); + }; + + Resolvers[K2] = createSymbolResolver( + [&](const SymbolNameSet &Symbols) { + return cantFail( + getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup)); + }, + [&](std::shared_ptr Query, + const SymbolNameSet &Symbols) { + return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup); + }); + + cantFail(ObjLayer.addObject(K2, std::move(Obj2))); + cantFail(ObjLayer.emitAndFinalize(K2)); + cantFail(ObjLayer.removeObject(K2)); + + // Finalization of module 2 should trigger finalization of module 1. + // Verify that finalize on SMMW is only called once. 
+ EXPECT_EQ(MM->FinalizationCount, 1) + << "Extra call to finalize"; +} + +TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) { + if (!SupportsJIT) + return; + + ExecutionSession ES; + + auto MM = std::make_shared(); + + LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey K) { + return LegacyRTDyldObjectLinkingLayer::Resources{ + MM, std::make_shared()}; + }); + SimpleCompiler Compile(*TM); + + // Create a pair of unrelated modules: + // + // Module 1: + // int foo() { return 42; } + // Module 2: + // int bar() { return 7; } + // + // Both modules will share a memory manager. We want to verify that the + // second object is not loaded before the first one is finalized. To do this + // in a portable way, we abuse the + // RuntimeDyld::MemoryManager::needsToReserveAllocationSpace hook, which is + // called once per object before any sections are allocated. + + ModuleBuilder MB1(Context, "", "dummy"); + { + MB1.getModule()->setDataLayout(TM->createDataLayout()); + Function *BarImpl = MB1.createFunctionDecl("foo"); + BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl); + IRBuilder<> Builder(BarEntry); + IntegerType *Int32Ty = IntegerType::get(Context, 32); + Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42); + Builder.CreateRet(FourtyTwo); + } + + auto Obj1 = Compile(*MB1.getModule()); + + ModuleBuilder MB2(Context, "", "dummy"); + { + MB2.getModule()->setDataLayout(TM->createDataLayout()); + Function *BarImpl = MB2.createFunctionDecl("bar"); + BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl); + IRBuilder<> Builder(BarEntry); + IntegerType *Int32Ty = IntegerType::get(Context, 32); + Value *Seven = ConstantInt::getSigned(Int32Ty, 7); + Builder.CreateRet(Seven); + } + auto Obj2 = Compile(*MB2.getModule()); + + auto K = ES.allocateVModule(); + cantFail(ObjLayer.addObject(K, std::move(Obj1))); + cantFail(ObjLayer.addObject(ES.allocateVModule(), std::move(Obj2))); + 
cantFail(ObjLayer.emitAndFinalize(K)); + cantFail(ObjLayer.removeObject(K)); + + // Only one call to needsToReserveAllocationSpace should have been made. + EXPECT_EQ(MM->NeedsToReserveAllocationSpaceCount, 1) + << "More than one call to needsToReserveAllocationSpace " + "(multiple unrelated objects loaded prior to finalization)"; +} + +TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, TestNotifyLoadedSignature) { + ExecutionSession ES; + LegacyRTDyldObjectLinkingLayer ObjLayer( + ES, + [](VModuleKey) { + return LegacyRTDyldObjectLinkingLayer::Resources{ + nullptr, std::make_shared()}; + }, + [](VModuleKey, const object::ObjectFile &obj, + const RuntimeDyld::LoadedObjectInfo &info) {}); +} + +} // end anonymous namespace diff --git a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp index 6ad3c19ada9..1c530247a7c 100644 --- a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp @@ -175,19 +175,19 @@ private: } }; -// Test each operation on ObjectTransformLayer. -TEST(ObjectTransformLayerTest, Main) { +// Test each operation on LegacyObjectTransformLayer. +TEST(LegacyObjectTransformLayerTest, Main) { MockBaseLayer M; ExecutionSession ES(std::make_shared()); // Create one object transform layer using a transform (as a functor) // that allocates new objects, and deals in unique pointers. 
- ObjectTransformLayer T1(M); + LegacyObjectTransformLayer T1(M); // Create a second object transform layer using a transform (as a lambda) // that mutates objects in place, and deals in naked pointers - ObjectTransformLayer( std::shared_ptr)>> T2(M, [](std::shared_ptr Obj) { @@ -257,9 +257,9 @@ TEST(ObjectTransformLayerTest, Main) { if (!RunStaticChecks) return; - // Make sure that ObjectTransformLayer implements the object layer concept + // Make sure that LegacyObjectTransformLayer implements the object layer concept // correctly by sandwitching one between an ObjectLinkingLayer and an - // IRCompileLayer, verifying that it compiles if we have a call to the + // LegacyIRCompileLayer, verifying that it compiles if we have a call to the // IRComileLayer's addModule that should call the transform layer's // addObject, and also calling the other public transform layer methods // directly to make sure the methods they intend to forward to exist on @@ -282,8 +282,8 @@ TEST(ObjectTransformLayerTest, Main) { }; // Construct the jit layers. - RTDyldObjectLinkingLayer BaseLayer(ES, [](VModuleKey) { - return RTDyldObjectLinkingLayer::Resources{ + LegacyRTDyldObjectLinkingLayer BaseLayer(ES, [](VModuleKey) { + return LegacyRTDyldObjectLinkingLayer::Resources{ std::make_shared(), std::make_shared()}; }); @@ -291,20 +291,20 @@ TEST(ObjectTransformLayerTest, Main) { auto IdentityTransform = [](std::unique_ptr Obj) { return Obj; }; - ObjectTransformLayer + LegacyObjectTransformLayer TransformLayer(BaseLayer, IdentityTransform); auto NullCompiler = [](llvm::Module &) { return std::unique_ptr(nullptr); }; - IRCompileLayer + LegacyIRCompileLayer CompileLayer(TransformLayer, NullCompiler); - // Make sure that the calls from IRCompileLayer to ObjectTransformLayer + // Make sure that the calls from LegacyIRCompileLayer to LegacyObjectTransformLayer // compile. 
cantFail(CompileLayer.addModule(ES.allocateVModule(), std::unique_ptr())); - // Make sure that the calls from ObjectTransformLayer to ObjectLinkingLayer + // Make sure that the calls from LegacyObjectTransformLayer to ObjectLinkingLayer // compile. VModuleKey DummyKey = ES.allocateVModule(); cantFail(TransformLayer.emitAndFinalize(DummyKey)); diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp deleted file mode 100644 index 1dbd48b5972..00000000000 --- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp +++ /dev/null @@ -1,228 +0,0 @@ -//===--- RTDyldObjectLinkingLayer2Test.cpp - RTDyld linking layer tests ---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "OrcTestCommon.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/ExecutionEngine/Orc/CompileUtils.h" -#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" -#include "llvm/ExecutionEngine/Orc/LambdaResolver.h" -#include "llvm/ExecutionEngine/Orc/Legacy.h" -#include "llvm/ExecutionEngine/Orc/NullResolver.h" -#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" -#include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/LLVMContext.h" -#include "gtest/gtest.h" - -using namespace llvm; -using namespace llvm::orc; - -namespace { - -class RTDyldObjectLinkingLayer2ExecutionTest : public testing::Test, - public OrcExecutionTest {}; - -// Adds an object with a debug section to RuntimeDyld and then returns whether -// the debug section was passed to the memory manager. 
-static bool testSetProcessAllSections(std::unique_ptr Obj, - bool ProcessAllSections) { - class MemoryManagerWrapper : public SectionMemoryManager { - public: - MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {} - uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, StringRef SectionName, - bool IsReadOnly) override { - if (SectionName == ".debug_str") - DebugSeen = true; - return SectionMemoryManager::allocateDataSection( - Size, Alignment, SectionID, SectionName, IsReadOnly); - } - - private: - bool &DebugSeen; - }; - - bool DebugSectionSeen = false; - - ExecutionSession ES; - auto &JD = ES.createJITDylib("main"); - auto Foo = ES.intern("foo"); - - RTDyldObjectLinkingLayer2 ObjLayer(ES, [&DebugSectionSeen](VModuleKey) { - return llvm::make_unique(DebugSectionSeen); - }); - - auto OnResolveDoNothing = [](Expected R) { - cantFail(std::move(R)); - }; - - auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); }; - - ObjLayer.setProcessAllSections(ProcessAllSections); - auto K = ES.allocateVModule(); - cantFail(ObjLayer.add(JD, K, std::move(Obj))); - ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing, - NoDependenciesToRegister); - return DebugSectionSeen; -} - -TEST(RTDyldObjectLinkingLayer2Test, TestSetProcessAllSections) { - LLVMContext Context; - auto M = llvm::make_unique("", Context); - M->setTargetTriple("x86_64-unknown-linux-gnu"); - Type *Int32Ty = IntegerType::get(Context, 32); - GlobalVariable *GV = - new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, - ConstantInt::get(Int32Ty, 42), "foo"); - - GV->setSection(".debug_str"); - - // Initialize the native target in case this is the first unit test - // to try to build a TM. 
- OrcNativeTarget::initialize(); - std::unique_ptr TM(EngineBuilder().selectTarget( - Triple(M->getTargetTriple()), "", "", SmallVector())); - if (!TM) - return; - - auto Obj = SimpleCompiler(*TM)(*M); - - EXPECT_FALSE(testSetProcessAllSections( - MemoryBuffer::getMemBufferCopy(Obj->getBuffer()), false)) - << "Debug section seen despite ProcessAllSections being false"; - EXPECT_TRUE(testSetProcessAllSections(std::move(Obj), true)) - << "Expected to see debug section when ProcessAllSections is true"; -} - -TEST(RTDyldObjectLinkingLayer2Test, TestOverrideObjectFlags) { - - OrcNativeTarget::initialize(); - - std::unique_ptr TM( - EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "", - SmallVector())); - - if (!TM) - return; - - // Our compiler is going to modify symbol visibility settings without telling - // ORC. This will test our ability to override the flags later. - class FunkySimpleCompiler : public SimpleCompiler { - public: - FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {} - - CompileResult operator()(Module &M) { - auto *Foo = M.getFunction("foo"); - assert(Foo && "Expected function Foo not found"); - Foo->setVisibility(GlobalValue::HiddenVisibility); - return SimpleCompiler::operator()(M); - } - }; - - // Create a module with two void() functions: foo and bar. 
- ThreadSafeContext TSCtx(llvm::make_unique()); - ThreadSafeModule M; - { - ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy"); - MB.getModule()->setDataLayout(TM->createDataLayout()); - - Function *FooImpl = MB.createFunctionDecl("foo"); - BasicBlock *FooEntry = - BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl); - IRBuilder<> B1(FooEntry); - B1.CreateRetVoid(); - - Function *BarImpl = MB.createFunctionDecl("bar"); - BasicBlock *BarEntry = - BasicBlock::Create(*TSCtx.getContext(), "entry", BarImpl); - IRBuilder<> B2(BarEntry); - B2.CreateRetVoid(); - - M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx)); - } - - // Create a simple stack and set the override flags option. - ExecutionSession ES; - auto &JD = ES.createJITDylib("main"); - auto Foo = ES.intern("foo"); - RTDyldObjectLinkingLayer2 ObjLayer( - ES, [](VModuleKey) { return llvm::make_unique(); }); - IRCompileLayer2 CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM)); - - ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true); - - cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M))); - ES.lookup({&JD}, {Foo}, [](Expected R) { cantFail(std::move(R)); }, - [](Error Err) { cantFail(std::move(Err)); }, - NoDependenciesToRegister); -} - -TEST(RTDyldObjectLinkingLayer2Test, TestAutoClaimResponsibilityForSymbols) { - - OrcNativeTarget::initialize(); - - std::unique_ptr TM( - EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "", - SmallVector())); - - if (!TM) - return; - - // Our compiler is going to add a new symbol without telling ORC. - // This will test our ability to auto-claim responsibility later. 
- class FunkySimpleCompiler : public SimpleCompiler { - public: - FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {} - - CompileResult operator()(Module &M) { - Function *BarImpl = - Function::Create(TypeBuilder::get(M.getContext()), - GlobalValue::ExternalLinkage, "bar", &M); - BasicBlock *BarEntry = - BasicBlock::Create(M.getContext(), "entry", BarImpl); - IRBuilder<> B(BarEntry); - B.CreateRetVoid(); - - return SimpleCompiler::operator()(M); - } - }; - - // Create a module with two void() functions: foo and bar. - ThreadSafeContext TSCtx(llvm::make_unique()); - ThreadSafeModule M; - { - ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy"); - MB.getModule()->setDataLayout(TM->createDataLayout()); - - Function *FooImpl = MB.createFunctionDecl("foo"); - BasicBlock *FooEntry = - BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl); - IRBuilder<> B(FooEntry); - B.CreateRetVoid(); - - M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx)); - } - - // Create a simple stack and set the override flags option. 
- ExecutionSession ES; - auto &JD = ES.createJITDylib("main"); - auto Foo = ES.intern("foo"); - RTDyldObjectLinkingLayer2 ObjLayer( - ES, [](VModuleKey) { return llvm::make_unique(); }); - IRCompileLayer2 CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM)); - - ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true); - - cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M))); - ES.lookup({&JD}, {Foo}, [](Expected R) { cantFail(std::move(R)); }, - [](Error Err) { cantFail(std::move(Err)); }, - NoDependenciesToRegister); -} - -} // end anonymous namespace diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp index 62c6b7dfa31..75ccfc9ab0d 100644 --- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp +++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp @@ -1,4 +1,4 @@ -//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===// +//===--- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer tests ---===// // // The LLVM Compiler Infrastructure // @@ -7,13 +7,14 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "OrcTestCommon.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" #include "llvm/ExecutionEngine/Orc/Legacy.h" #include "llvm/ExecutionEngine/Orc/NullResolver.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/Constants.h" #include "llvm/IR/LLVMContext.h" @@ -25,258 +26,203 @@ using namespace llvm::orc; namespace { class RTDyldObjectLinkingLayerExecutionTest : public testing::Test, - public OrcExecutionTest { - -}; - -class SectionMemoryManagerWrapper : 
public SectionMemoryManager { -public: - int FinalizationCount = 0; - int NeedsToReserveAllocationSpaceCount = 0; - - bool needsToReserveAllocationSpace() override { - ++NeedsToReserveAllocationSpaceCount; - return SectionMemoryManager::needsToReserveAllocationSpace(); - } - - bool finalizeMemory(std::string *ErrMsg = nullptr) override { - ++FinalizationCount; - return SectionMemoryManager::finalizeMemory(ErrMsg); - } -}; + public OrcExecutionTest {}; -TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) { +// Adds an object with a debug section to RuntimeDyld and then returns whether +// the debug section was passed to the memory manager. +static bool testSetProcessAllSections(std::unique_ptr Obj, + bool ProcessAllSections) { class MemoryManagerWrapper : public SectionMemoryManager { public: MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {} uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, - unsigned SectionID, - StringRef SectionName, + unsigned SectionID, StringRef SectionName, bool IsReadOnly) override { if (SectionName == ".debug_str") DebugSeen = true; - return SectionMemoryManager::allocateDataSection(Size, Alignment, - SectionID, - SectionName, - IsReadOnly); + return SectionMemoryManager::allocateDataSection( + Size, Alignment, SectionID, SectionName, IsReadOnly); } + private: bool &DebugSeen; }; bool DebugSectionSeen = false; - auto MM = std::make_shared(DebugSectionSeen); ExecutionSession ES; + auto &JD = ES.createJITDylib("main"); + auto Foo = ES.intern("foo"); - RTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey) { - return RTDyldObjectLinkingLayer::Resources{ - MM, std::make_shared()}; + RTDyldObjectLinkingLayer ObjLayer(ES, [&DebugSectionSeen](VModuleKey) { + return llvm::make_unique(DebugSectionSeen); }); + auto OnResolveDoNothing = [](Expected R) { + cantFail(std::move(R)); + }; + + auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); }; + + ObjLayer.setProcessAllSections(ProcessAllSections); + 
auto K = ES.allocateVModule(); + cantFail(ObjLayer.add(JD, K, std::move(Obj))); + ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing, + NoDependenciesToRegister); + return DebugSectionSeen; +} + +TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) { LLVMContext Context; auto M = llvm::make_unique("", Context); M->setTargetTriple("x86_64-unknown-linux-gnu"); Type *Int32Ty = IntegerType::get(Context, 32); GlobalVariable *GV = - new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, + new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage, ConstantInt::get(Int32Ty, 42), "foo"); GV->setSection(".debug_str"); - // Initialize the native target in case this is the first unit test // to try to build a TM. OrcNativeTarget::initialize(); - std::unique_ptr TM( - EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "", - SmallVector())); + std::unique_ptr TM(EngineBuilder().selectTarget( + Triple(M->getTargetTriple()), "", "", SmallVector())); if (!TM) return; auto Obj = SimpleCompiler(*TM)(*M); - { - // Test with ProcessAllSections = false (the default). - auto K = ES.allocateVModule(); - cantFail(ObjLayer.addObject( - K, MemoryBuffer::getMemBufferCopy(Obj->getBuffer()))); - cantFail(ObjLayer.emitAndFinalize(K)); - EXPECT_EQ(DebugSectionSeen, false) - << "Unexpected debug info section"; - cantFail(ObjLayer.removeObject(K)); - } - - { - // Test with ProcessAllSections = true. 
- ObjLayer.setProcessAllSections(true); - auto K = ES.allocateVModule(); - cantFail(ObjLayer.addObject(K, std::move(Obj))); - cantFail(ObjLayer.emitAndFinalize(K)); - EXPECT_EQ(DebugSectionSeen, true) - << "Expected debug info section not seen"; - cantFail(ObjLayer.removeObject(K)); - } + EXPECT_FALSE(testSetProcessAllSections( + MemoryBuffer::getMemBufferCopy(Obj->getBuffer()), false)) + << "Debug section seen despite ProcessAllSections being false"; + EXPECT_TRUE(testSetProcessAllSections(std::move(Obj), true)) + << "Expected to see debug section when ProcessAllSections is true"; } -TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) { - if (!SupportsJIT) - return; +TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) { - ExecutionSession ES; + OrcNativeTarget::initialize(); - auto MM = std::make_shared(); + std::unique_ptr TM( + EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "", + SmallVector())); - std::map> Resolvers; + if (!TM) + return; - RTDyldObjectLinkingLayer ObjLayer(ES, [&](VModuleKey K) { - auto I = Resolvers.find(K); - assert(I != Resolvers.end() && "Missing resolver"); - auto R = std::move(I->second); - Resolvers.erase(I); - return RTDyldObjectLinkingLayer::Resources{MM, std::move(R)}; - }); - SimpleCompiler Compile(*TM); - - // Create a pair of modules that will trigger recursive finalization: - // Module 1: - // int bar() { return 42; } - // Module 2: - // int bar(); - // int foo() { return bar(); } - // - // Verify that the memory manager is only finalized once (for Module 2). - // Failure suggests that finalize is being called on the inner RTDyld - // instance (for Module 1) which is unsafe, as it will prevent relocation of - // Module 2. 
- - ModuleBuilder MB1(Context, "", "dummy"); - { - MB1.getModule()->setDataLayout(TM->createDataLayout()); - Function *BarImpl = MB1.createFunctionDecl("bar"); - BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl); - IRBuilder<> Builder(BarEntry); - IntegerType *Int32Ty = IntegerType::get(Context, 32); - Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42); - Builder.CreateRet(FourtyTwo); - } + // Our compiler is going to modify symbol visibility settings without telling + // ORC. This will test our ability to override the flags later. + class FunkySimpleCompiler : public SimpleCompiler { + public: + FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {} - auto Obj1 = Compile(*MB1.getModule()); + CompileResult operator()(Module &M) { + auto *Foo = M.getFunction("foo"); + assert(Foo && "Expected function Foo not found"); + Foo->setVisibility(GlobalValue::HiddenVisibility); + return SimpleCompiler::operator()(M); + } + }; - ModuleBuilder MB2(Context, "", "dummy"); + // Create a module with two void() functions: foo and bar. 
+ ThreadSafeContext TSCtx(llvm::make_unique()); + ThreadSafeModule M; { - MB2.getModule()->setDataLayout(TM->createDataLayout()); - Function *BarDecl = MB2.createFunctionDecl("bar"); - Function *FooImpl = MB2.createFunctionDecl("foo"); - BasicBlock *FooEntry = BasicBlock::Create(Context, "entry", FooImpl); - IRBuilder<> Builder(FooEntry); - Builder.CreateRet(Builder.CreateCall(BarDecl)); + ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy"); + MB.getModule()->setDataLayout(TM->createDataLayout()); + + Function *FooImpl = MB.createFunctionDecl("foo"); + BasicBlock *FooEntry = + BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl); + IRBuilder<> B1(FooEntry); + B1.CreateRetVoid(); + + Function *BarImpl = MB.createFunctionDecl("bar"); + BasicBlock *BarEntry = + BasicBlock::Create(*TSCtx.getContext(), "entry", BarImpl); + IRBuilder<> B2(BarEntry); + B2.CreateRetVoid(); + + M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx)); } - auto Obj2 = Compile(*MB2.getModule()); - auto K1 = ES.allocateVModule(); - Resolvers[K1] = std::make_shared(); - cantFail(ObjLayer.addObject(K1, std::move(Obj1))); + // Create a simple stack and set the override flags option. 
+ ExecutionSession ES; + auto &JD = ES.createJITDylib("main"); + auto Foo = ES.intern("foo"); + RTDyldObjectLinkingLayer ObjLayer( + ES, [](VModuleKey) { return llvm::make_unique(); }); + IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM)); - auto K2 = ES.allocateVModule(); - auto LegacyLookup = [&](const std::string &Name) { - return ObjLayer.findSymbol(Name, true); - }; + ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true); - Resolvers[K2] = createSymbolResolver( - [&](const SymbolNameSet &Symbols) { - return cantFail( - getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup)); - }, - [&](std::shared_ptr Query, - const SymbolNameSet &Symbols) { - return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup); - }); - - cantFail(ObjLayer.addObject(K2, std::move(Obj2))); - cantFail(ObjLayer.emitAndFinalize(K2)); - cantFail(ObjLayer.removeObject(K2)); - - // Finalization of module 2 should trigger finalization of module 1. - // Verify that finalize on SMMW is only called once. 
- EXPECT_EQ(MM->FinalizationCount, 1) - << "Extra call to finalize"; + cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M))); + ES.lookup({&JD}, {Foo}, [](Expected R) { cantFail(std::move(R)); }, + [](Error Err) { cantFail(std::move(Err)); }, + NoDependenciesToRegister); } -TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) { - if (!SupportsJIT) - return; +TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) { - ExecutionSession ES; + OrcNativeTarget::initialize(); - auto MM = std::make_shared(); + std::unique_ptr TM( + EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "", + SmallVector())); - RTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey K) { - return RTDyldObjectLinkingLayer::Resources{ - MM, std::make_shared()}; - }); - SimpleCompiler Compile(*TM); - - // Create a pair of unrelated modules: - // - // Module 1: - // int foo() { return 42; } - // Module 2: - // int bar() { return 7; } - // - // Both modules will share a memory manager. We want to verify that the - // second object is not loaded before the first one is finalized. To do this - // in a portable way, we abuse the - // RuntimeDyld::MemoryManager::needsToReserveAllocationSpace hook, which is - // called once per object before any sections are allocated. - - ModuleBuilder MB1(Context, "", "dummy"); - { - MB1.getModule()->setDataLayout(TM->createDataLayout()); - Function *BarImpl = MB1.createFunctionDecl("foo"); - BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl); - IRBuilder<> Builder(BarEntry); - IntegerType *Int32Ty = IntegerType::get(Context, 32); - Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42); - Builder.CreateRet(FourtyTwo); - } + if (!TM) + return; - auto Obj1 = Compile(*MB1.getModule()); + // Our compiler is going to add a new symbol without telling ORC. + // This will test our ability to auto-claim responsibility later. 
+ class FunkySimpleCompiler : public SimpleCompiler { + public: + FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {} + + CompileResult operator()(Module &M) { + Function *BarImpl = + Function::Create(TypeBuilder::get(M.getContext()), + GlobalValue::ExternalLinkage, "bar", &M); + BasicBlock *BarEntry = + BasicBlock::Create(M.getContext(), "entry", BarImpl); + IRBuilder<> B(BarEntry); + B.CreateRetVoid(); + + return SimpleCompiler::operator()(M); + } + }; - ModuleBuilder MB2(Context, "", "dummy"); + // Create a module with two void() functions: foo and bar. + ThreadSafeContext TSCtx(llvm::make_unique()); + ThreadSafeModule M; { - MB2.getModule()->setDataLayout(TM->createDataLayout()); - Function *BarImpl = MB2.createFunctionDecl("bar"); - BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl); - IRBuilder<> Builder(BarEntry); - IntegerType *Int32Ty = IntegerType::get(Context, 32); - Value *Seven = ConstantInt::getSigned(Int32Ty, 7); - Builder.CreateRet(Seven); - } - auto Obj2 = Compile(*MB2.getModule()); + ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy"); + MB.getModule()->setDataLayout(TM->createDataLayout()); - auto K = ES.allocateVModule(); - cantFail(ObjLayer.addObject(K, std::move(Obj1))); - cantFail(ObjLayer.addObject(ES.allocateVModule(), std::move(Obj2))); - cantFail(ObjLayer.emitAndFinalize(K)); - cantFail(ObjLayer.removeObject(K)); - - // Only one call to needsToReserveAllocationSpace should have been made. 
- EXPECT_EQ(MM->NeedsToReserveAllocationSpaceCount, 1) - << "More than one call to needsToReserveAllocationSpace " - "(multiple unrelated objects loaded prior to finalization)"; -} + Function *FooImpl = MB.createFunctionDecl("foo"); + BasicBlock *FooEntry = + BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl); + IRBuilder<> B(FooEntry); + B.CreateRetVoid(); -TEST_F(RTDyldObjectLinkingLayerExecutionTest, TestNotifyLoadedSignature) { + M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx)); + } + + // Create a simple stack and set the override flags option. ExecutionSession ES; + auto &JD = ES.createJITDylib("main"); + auto Foo = ES.intern("foo"); RTDyldObjectLinkingLayer ObjLayer( - ES, - [](VModuleKey) { - return RTDyldObjectLinkingLayer::Resources{ - nullptr, std::make_shared()}; - }, - [](VModuleKey, const object::ObjectFile &obj, - const RuntimeDyld::LoadedObjectInfo &info) {}); + ES, [](VModuleKey) { return llvm::make_unique(); }); + IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM)); + + ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true); + + cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M))); + ES.lookup({&JD}, {Foo}, [](Expected R) { cantFail(std::move(R)); }, + [](Error Err) { cantFail(std::move(Err)); }, + NoDependenciesToRegister); } } // end anonymous namespace -- GitLab From 3926274437d3fa4050a03e06074535388e1d7e9f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 15 Oct 2018 23:34:58 +0000 Subject: [PATCH 0229/1116] [X86] Remove some isel patterns that shouldn't be possible. These included a bitcast of a load from v4f32 to v2f64, but DAG combine should have already changed the type of the load to remove the cast. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344573 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 2 -- lib/Target/X86/X86InstrSSE.td | 4 ---- 2 files changed, 6 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 158aba447ed..f617de7dd7d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -4421,8 +4421,6 @@ let Predicates = [HasAVX512] in { (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; - def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; def : Pat<(v2f64 (X86vzload addr:$src)), (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index b3c639f4f0c..8a836d8c173 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -265,8 +265,6 @@ let Predicates = [UseAVX] in { (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzload addr:$src)), (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; @@ -349,8 +347,6 @@ let Predicates = [UseSSE2] in { (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; - def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), - (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; def : Pat<(v2f64 (X86vzload addr:$src)), (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; } -- GitLab From 2fa550c88a10d1fd710493b0fd6885bfe4d75f88 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 16 Oct 2018 00:09:12 +0000 Subject: [PATCH 0230/1116] 
[WebAssembly] LSDA info generation Summary: This adds support for LSDA (exception table) generation for wasm EH. Wasm EH mostly follows the structure of Itanium-style exception tables, with one exception: a call site table entry in wasm EH corresponds not to a call site but to a landing pad. In wasm EH, the VM is responsible for stack unwinding. After an exception occurs and the stack is unwound, the control flow is transferred to wasm 'catch' instruction by the VM, after which the personality function is called from the compiler-generated code. (Refer to WasmEHPrepare pass for more information on this part.) This patch: - Changes wasm.landingpad.index intrinsic to take a token argument, to make this 1:1 match with a catchpad instruction - Stores landingpad index info and catch type info in MachineFunction before instruction selection - Lowers wasm.lsda intrinsic to an MCSymbol pointing to the start of an exception table - Adds WasmException class with overridden methods for table generation - Adds support for LSDA section in Wasm object writer Reviewers: dschuff, sbc100, rnk Subscribers: mgorny, jgravelle-google, sunfish, llvm-commits Differential Revision: https://reviews.llvm.org/D52748 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344575 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/MachineFunction.h | 22 +- include/llvm/IR/IntrinsicsWebAssembly.td | 3 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 3 +- lib/CodeGen/AsmPrinter/CMakeLists.txt | 1 + lib/CodeGen/AsmPrinter/EHStreamer.cpp | 10 +- lib/CodeGen/AsmPrinter/EHStreamer.h | 11 +- lib/CodeGen/AsmPrinter/WasmException.cpp | 81 ++++++ lib/CodeGen/AsmPrinter/WasmException.h | 42 +++ lib/CodeGen/MachineFunction.cpp | 45 ++-- .../SelectionDAG/SelectionDAGBuilder.cpp | 10 +- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 83 ++++-- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 4 + lib/CodeGen/WasmEHPrepare.cpp | 2 +- lib/MC/MCObjectFileInfo.cpp | 6 + lib/MC/WasmObjectWriter.cpp | 4 +- 
.../WebAssembly/WebAssemblyISelLowering.cpp | 15 +- .../WebAssembly/WebAssemblyInstrInfo.td | 2 + .../WebAssembly/WebAssemblyMCInstLower.cpp | 7 + test/CodeGen/WebAssembly/eh-lsda.ll | 239 ++++++++++++++++++ test/CodeGen/WebAssembly/wasmehprepare.ll | 6 +- 20 files changed, 529 insertions(+), 67 deletions(-) create mode 100644 lib/CodeGen/AsmPrinter/WasmException.cpp create mode 100644 lib/CodeGen/AsmPrinter/WasmException.h create mode 100644 test/CodeGen/WebAssembly/eh-lsda.ll diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 7471b314846..bc81e485a80 100644 --- a/include/llvm/CodeGen/MachineFunction.h +++ b/include/llvm/CodeGen/MachineFunction.h @@ -316,6 +316,9 @@ class MachineFunction { /// Map a landing pad's EH symbol to the call site indexes. DenseMap> LPadToCallSiteMap; + /// Map a landing pad to its index. + DenseMap WasmLPadToIndexMap; + /// Map of invoke call site index values to associated begin EH_LABEL. DenseMap CallSiteMap; @@ -810,7 +813,8 @@ public: LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad); /// Remap landing pad labels and remove any deleted landing pads. - void tidyLandingPads(DenseMap *LPMap = nullptr); + void tidyLandingPads(DenseMap *LPMap = nullptr, + bool TidyIfNoBeginLabels = true); /// Return a reference to the landing pad info for the current function. const std::vector &getLandingPads() const { @@ -853,6 +857,22 @@ public: /// Map the landing pad's EH symbol to the call site indexes. void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef Sites); + /// Map the landing pad to its index. Used for Wasm exception handling. + void setWasmLandingPadIndex(const MachineBasicBlock *LPad, unsigned Index) { + WasmLPadToIndexMap[LPad] = Index; + } + + /// Returns true if the landing pad has an associated index in wasm EH. 
+ bool hasWasmLandingPadIndex(const MachineBasicBlock *LPad) const { + return WasmLPadToIndexMap.count(LPad); + } + + /// Get the index in wasm EH for a given landing pad. + unsigned getWasmLandingPadIndex(const MachineBasicBlock *LPad) const { + assert(hasWasmLandingPadIndex(LPad)); + return WasmLPadToIndexMap.lookup(LPad); + } + /// Get the call site indexes for a landing pad EH symbol. SmallVectorImpl &getCallSiteLandingPad(MCSymbol *Sym) { assert(hasCallSiteLandingPad(Sym) && diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td index adf7cb0ba0e..9aa2a4ebeca 100644 --- a/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/include/llvm/IR/IntrinsicsWebAssembly.td @@ -71,7 +71,8 @@ def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], // WebAssembly EH must maintain the landingpads in the order assigned to them // by WasmEHPrepare pass to generate landingpad table in EHStreamer. This is // used in order to give them the indices in WasmEHPrepare. -def int_wasm_landingpad_index: Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; +def int_wasm_landingpad_index: Intrinsic<[], [llvm_token_ty, llvm_i32_ty], + [IntrNoMem]>; // Returns LSDA address of the current function. 
def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 63c5b262edc..526f7ce3083 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -16,6 +16,7 @@ #include "CodeViewDebug.h" #include "DwarfDebug.h" #include "DwarfException.h" +#include "WasmException.h" #include "WinCFGuard.h" #include "WinException.h" #include "llvm/ADT/APFloat.h" @@ -356,7 +357,7 @@ bool AsmPrinter::doInitialization(Module &M) { } break; case ExceptionHandling::Wasm: - // TODO to prevent warning + ES = new WasmException(this); break; } if (ES) diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt index 6cba4a0d4b8..3fb088ab6f0 100644 --- a/lib/CodeGen/AsmPrinter/CMakeLists.txt +++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt @@ -23,6 +23,7 @@ add_llvm_library(LLVMAsmPrinter WinCFGuard.cpp WinException.cpp CodeViewDebug.cpp + WasmException.cpp DEPENDS intrinsics_gen diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp index be04b9a6e8c..7599121de2b 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -345,7 +345,9 @@ computeCallSiteTable(SmallVectorImpl &CallSites, /// unwound and handling continues. /// 3. Type ID table contains references to all the C++ typeinfo for all /// catches in the function. This tables is reverse indexed base 1. -void EHStreamer::emitExceptionTable() { +/// +/// Returns the starting symbol of an exception table. 
+MCSymbol *EHStreamer::emitExceptionTable() { const MachineFunction *MF = Asm->MF; const std::vector &TypeInfos = MF->getTypeInfos(); const std::vector &FilterIds = MF->getFilterIds(); @@ -375,6 +377,7 @@ void EHStreamer::emitExceptionTable() { computeCallSiteTable(CallSites, LandingPads, FirstActions); bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj; + bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm; unsigned CallSiteEncoding = IsSJLJ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_uleb128; bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty(); @@ -457,8 +460,8 @@ void EHStreamer::emitExceptionTable() { Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel); Asm->OutStreamer->EmitLabel(CstBeginLabel); - // SjLj Exception handling - if (IsSJLJ) { + // SjLj / Wasm Exception handling + if (IsSJLJ || IsWasm) { unsigned idx = 0; for (SmallVectorImpl::const_iterator I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) { @@ -604,6 +607,7 @@ void EHStreamer::emitExceptionTable() { } Asm->EmitAlignment(2); + return GCCETSym; } void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) { diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h index b89421a1e06..e3a6f8e9d58 100644 --- a/lib/CodeGen/AsmPrinter/EHStreamer.h +++ b/lib/CodeGen/AsmPrinter/EHStreamer.h @@ -85,9 +85,10 @@ protected: /// zero for the landing pad and the action. Calls marked 'nounwind' have /// no entry and must not be contained in the try-range of any entry - they /// form gaps in the table. Entries must be ordered by try-range address. - void computeCallSiteTable(SmallVectorImpl &CallSites, - const SmallVectorImpl &LandingPads, - const SmallVectorImpl &FirstActions); + virtual void computeCallSiteTable( + SmallVectorImpl &CallSites, + const SmallVectorImpl &LandingPads, + const SmallVectorImpl &FirstActions); /// Emit landing pads and actions. 
/// @@ -108,7 +109,9 @@ protected: /// found the frame is unwound and handling continues. /// 3. Type id table contains references to all the C++ typeinfo for all /// catches in the function. This tables is reversed indexed base 1. - void emitExceptionTable(); + /// + /// Returns the starting symbol of an exception table. + MCSymbol *emitExceptionTable(); virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel); diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp new file mode 100644 index 00000000000..46745d08c9f --- /dev/null +++ b/lib/CodeGen/AsmPrinter/WasmException.cpp @@ -0,0 +1,81 @@ +//===-- CodeGen/AsmPrinter/WasmException.cpp - Wasm Exception Impl --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for writing WebAssembly exception info into asm +// files. +// +//===----------------------------------------------------------------------===// + +#include "WasmException.h" +#include "llvm/MC/MCStreamer.h" +using namespace llvm; + +void WasmException::markFunctionEnd() { + // Get rid of any dead landing pads. + if (!Asm->MF->getLandingPads().empty()) { + auto *NonConstMF = const_cast(Asm->MF); + // Wasm does not set BeginLabel and EndLabel information for landing pads, + // so we should set the second argument false. 
+ NonConstMF->tidyLandingPads(nullptr, /* TidyIfNoBeginLabels */ false); + } +} + +void WasmException::endFunction(const MachineFunction *MF) { + bool ShouldEmitExceptionTable = false; + for (const LandingPadInfo &Info : MF->getLandingPads()) { + if (MF->hasWasmLandingPadIndex(Info.LandingPadBlock)) { + ShouldEmitExceptionTable = true; + break; + } + } + if (!ShouldEmitExceptionTable) + return; + MCSymbol *LSDALabel = emitExceptionTable(); + assert(LSDALabel && ".GCC_exception_table has not been emitted!"); + + // Wasm requires every data section symbol to have a .size set. So we emit an + // end marker and set the size as the difference between the start and the end + // marker. + MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end"); + Asm->OutStreamer->EmitLabel(LSDAEndLabel); + MCContext &OutContext = Asm->OutStreamer->getContext(); + const MCExpr *SizeExp = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(LSDAEndLabel, OutContext), + MCSymbolRefExpr::create(LSDALabel, OutContext), OutContext); + Asm->OutStreamer->emitELFSize(LSDALabel, SizeExp); +} + +// Compute the call-site table for wasm EH. Even though we use the same function +// name to share the common routines, a call site entry in the table corresponds +// to not a call site for possibly-throwing functions but a landing pad. In wasm +// EH the VM is responsible for stack unwinding. After an exception occurs and +// the stack is unwound, the control flow is transferred to wasm 'catch' +// instruction by the VM, after which the personality function is called from +// the compiler-generated code. Refer to WasmEHPrepare pass for more +// information. 
+void WasmException::computeCallSiteTable( + SmallVectorImpl &CallSites, + const SmallVectorImpl &LandingPads, + const SmallVectorImpl &FirstActions) { + MachineFunction &MF = *Asm->MF; + for (unsigned I = 0, N = LandingPads.size(); I < N; ++I) { + const LandingPadInfo *Info = LandingPads[I]; + MachineBasicBlock *LPad = Info->LandingPadBlock; + // We don't emit LSDA for single catch (...). + if (!MF.hasWasmLandingPadIndex(LPad)) + continue; + // Wasm EH must maintain the EH pads in the order assigned to them by the + // WasmEHPrepare pass. + unsigned LPadIndex = MF.getWasmLandingPadIndex(LPad); + CallSiteEntry Site = {nullptr, nullptr, Info, FirstActions[I]}; + if (CallSites.size() < LPadIndex + 1) + CallSites.resize(LPadIndex + 1); + CallSites[LPadIndex] = Site; + } +} diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h new file mode 100644 index 00000000000..09a9a25ce8d --- /dev/null +++ b/lib/CodeGen/AsmPrinter/WasmException.h @@ -0,0 +1,42 @@ +//===-- WasmException.h - Wasm Exception Framework -------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains support for writing WebAssembly exception info into asm +// files. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H +#define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H + +#include "EHStreamer.h" +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer { +public: + WasmException(AsmPrinter *A) : EHStreamer(A) {} + + void endModule() override {} + void beginFunction(const MachineFunction *MF) override {} + virtual void markFunctionEnd() override; + void endFunction(const MachineFunction *MF) override; + +protected: + // Compute the call site table for wasm EH. + void computeCallSiteTable( + SmallVectorImpl &CallSites, + const SmallVectorImpl &LandingPads, + const SmallVectorImpl &FirstActions) override; +}; + +} // End of namespace llvm + +#endif diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 431484f078b..9e4963c4bdb 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -661,8 +661,11 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) { } } - } else if (isa(FirstI)) { - // TODO + } else if (const auto *CPI = dyn_cast(FirstI)) { + for (unsigned I = CPI->getNumArgOperands(); I != 0; --I) { + Value *TypeInfo = CPI->getArgOperand(I - 1)->stripPointerCasts(); + addCatchTypeInfo(LandingPad, dyn_cast(TypeInfo)); + } } else { assert(isa(FirstI) && "Invalid landingpad!"); @@ -687,7 +690,8 @@ void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad, LP.TypeIds.push_back(getFilterIDFor(IdsInFilter)); } -void MachineFunction::tidyLandingPads(DenseMap *LPMap) { +void MachineFunction::tidyLandingPads(DenseMap *LPMap, + bool TidyIfNoBeginLabels) { for (unsigned i = 0; i != LandingPads.size(); ) { LandingPadInfo &LandingPad = LandingPads[i]; if (LandingPad.LandingPadLabel && @@ -702,24 +706,25 @@ void MachineFunction::tidyLandingPads(DenseMap *LPMap) { continue; } - for (unsigned j = 
0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) { - MCSymbol *BeginLabel = LandingPad.BeginLabels[j]; - MCSymbol *EndLabel = LandingPad.EndLabels[j]; - if ((BeginLabel->isDefined() || - (LPMap && (*LPMap)[BeginLabel] != 0)) && - (EndLabel->isDefined() || - (LPMap && (*LPMap)[EndLabel] != 0))) continue; - - LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j); - LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j); - --j; - --e; - } + if (TidyIfNoBeginLabels) { + for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) { + MCSymbol *BeginLabel = LandingPad.BeginLabels[j]; + MCSymbol *EndLabel = LandingPad.EndLabels[j]; + if ((BeginLabel->isDefined() || (LPMap && (*LPMap)[BeginLabel] != 0)) && + (EndLabel->isDefined() || (LPMap && (*LPMap)[EndLabel] != 0))) + continue; + + LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j); + LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j); + --j; + --e; + } - // Remove landing pads with no try-ranges. - if (LandingPads[i].BeginLabels.empty()) { - LandingPads.erase(LandingPads.begin() + i); - continue; + // Remove landing pads with no try-ranges. + if (LandingPads[i].BeginLabels.empty()) { + LandingPads.erase(LandingPads.begin() + i); + continue; + } } // If there is no landing pad, ensure that the list of typeids is empty. 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1a99ef734f1..3907f647142 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6282,12 +6282,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } - case Intrinsic::wasm_landingpad_index: { - // TODO store landing pad index in a map, which will be used when generating - // LSDA information + case Intrinsic::wasm_landingpad_index: + // Information this intrinsic contained has been transferred to + // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely + // delete it now. return nullptr; } - } } void SelectionDAGBuilder::visitConstrainedFPIntrinsic( @@ -6444,7 +6444,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI, WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo(); EHInfo->addIPToStateRange(cast(CLI.CS.getInstruction()), BeginLabel, EndLabel); - } else { + } else if (!isScopedEHPersonality(Pers)) { MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel); } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 2b4a590f19f..90bcaa653c3 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -27,6 +27,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -1128,6 +1129,36 @@ static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) { return false; } +// wasm.landingpad.index intrinsic is for associating a landing pad index number +// with a catchpad instruction. 
Retrieve the landing pad index in the intrinsic +// and store the mapping in the function. +static void mapWasmLandingPadIndex(MachineBasicBlock *MBB, + const CatchPadInst *CPI) { + MachineFunction *MF = MBB->getParent(); + // In case of single catch (...), we don't emit LSDA, so we don't need + // this information. + bool IsSingleCatchAllClause = + CPI->getNumArgOperands() == 1 && + cast(CPI->getArgOperand(0))->isNullValue(); + if (!IsSingleCatchAllClause) { + // Create a mapping from landing pad label to landing pad index. + bool IntrFound = false; + for (const User *U : CPI->users()) { + if (const auto *Call = dyn_cast(U)) { + Intrinsic::ID IID = Call->getIntrinsicID(); + if (IID == Intrinsic::wasm_landingpad_index) { + Value *IndexArg = Call->getArgOperand(1); + int Index = cast(IndexArg)->getZExtValue(); + MF->setWasmLandingPadIndex(MBB, Index); + IntrFound = true; + break; + } + } + } + assert(IntrFound && "wasm.landingpad.index intrinsic not found!"); + } +} + /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and /// do other setup for EH landing-pad blocks. bool SelectionDAGISel::PrepareEHLandingPad() { @@ -1137,44 +1168,48 @@ bool SelectionDAGISel::PrepareEHLandingPad() { const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout())); + auto Pers = classifyEHPersonality(PersonalityFn); + // Catchpads have one live-in register, which typically holds the exception // pointer or code. - if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) { - if (hasExceptionPointerOrCodeUser(CPI)) { - // Get or create the virtual register to hold the pointer or code. Mark - // the live in physreg and copy into the vreg. 
- MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn); - assert(EHPhysReg && "target lacks exception pointer register"); - MBB->addLiveIn(EHPhysReg); - unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC); - BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), - TII->get(TargetOpcode::COPY), VReg) - .addReg(EHPhysReg, RegState::Kill); + if (isFuncletEHPersonality(Pers)) { + if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) { + if (hasExceptionPointerOrCodeUser(CPI)) { + // Get or create the virtual register to hold the pointer or code. Mark + // the live in physreg and copy into the vreg. + MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn); + assert(EHPhysReg && "target lacks exception pointer register"); + MBB->addLiveIn(EHPhysReg); + unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC); + BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), + TII->get(TargetOpcode::COPY), VReg) + .addReg(EHPhysReg, RegState::Kill); + } } return true; } - if (!LLVMBB->isLandingPad()) - return true; - // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. MCSymbol *Label = MF->addLandingPad(MBB); - // Assign the call site to the landing pad's begin label. - MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]); - const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL); BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) .addSym(Label); - // Mark exception register as live in. - if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn)) - FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC); - - // Mark exception selector register as live in. 
- if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn)) - FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC); + if (Pers == EHPersonality::Wasm_CXX) { + if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) + mapWasmLandingPadIndex(MBB, CPI); + } else { + // Assign the call site to the landing pad's begin label. + MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]); + // Mark exception register as live in. + if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn)) + FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC); + // Mark exception selector register as live in. + if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn)) + FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC); + } return true; } diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index b046cd81d6c..341ab927861 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1748,6 +1748,10 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference( void TargetLoweringObjectFileWasm::InitializeWasm() { StaticCtorSection = getContext().getWasmSection(".init_array", SectionKind::getData()); + + // We don't use PersonalityEncoding and LSDAEncoding because we don't emit + // .cfi directives. We use TTypeEncoding to encode typeinfo global variables. + TTypeEncoding = dwarf::DW_EH_PE_absptr; } MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection( diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp index 83d04da5dd0..6f02a05f561 100644 --- a/lib/CodeGen/WasmEHPrepare.cpp +++ b/lib/CodeGen/WasmEHPrepare.cpp @@ -300,7 +300,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, unsigned Index) { // This is to create a map of in // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables. 
// Pseudocode: wasm.landingpad.index(Index); - IRB.CreateCall(LPadIndexF, IRB.getInt32(Index)); + IRB.CreateCall(LPadIndexF, {FPI, IRB.getInt32(Index)}); // Pseudocode: __wasm_lpad_context.lpad_index = index; IRB.CreateStore(IRB.getInt32(Index), LPadIndexField); diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index edfccfcb9ed..b1e03f8efee 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -743,6 +743,12 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) { DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata()); DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata()); + // Wasm use data section for LSDA. + // TODO Consider putting each function's exception table in a separate + // section, as in -function-sections, to facilitate lld's --gc-section. + LSDASection = Ctx->getWasmSection(".rodata.gcc_except_table", + SectionKind::getReadOnlyWithRel()); + // TODO: Define more sections. 
} diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index cbbe161ae82..f9318ad5801 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -635,10 +635,12 @@ static void addData(SmallVectorImpl &DataBytes, llvm_unreachable("The fill should be an assembler constant"); DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues, Fill->getValue()); + } else if (auto *LEB = dyn_cast(&Frag)) { + const SmallVectorImpl &Contents = LEB->getContents(); + DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end()); } else { const auto &DataFrag = cast(Frag); const SmallVectorImpl &Contents = DataFrag.getContents(); - DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end()); } } diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 30c2e843408..080bfe771a4 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/DiagnosticInfo.h" @@ -966,9 +967,17 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, default: return {}; // Don't custom lower most intrinsics. 
- case Intrinsic::wasm_lsda: - // TODO For now, just return 0 not to crash - return DAG.getConstant(0, DL, Op.getValueType()); + case Intrinsic::wasm_lsda: { + MachineFunction &MF = DAG.getMachineFunction(); + EVT VT = Op.getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + auto &Context = MF.getMMI().getContext(); + MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + + Twine(MF.getFunctionNumber())); + return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, + DAG.getMCSymbol(S, PtrVT)); + } } } diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index 8d98510c67d..4acad5f5943 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -269,6 +269,8 @@ def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)), (CONST_I32 tglobaladdr:$addr)>; def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)), (CONST_I32 texternalsym:$addr)>; +def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>; +def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>; //===----------------------------------------------------------------------===// // Additional sets of instructions. diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index e9a0cf51905..15b3da4c8b8 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -226,6 +226,13 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0, (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0); break; + case MachineOperand::MO_MCSymbol: + // This is currently used only for LSDA symbols (GCC_except_table), + // because global addresses or other external symbols are handled above. 
+ assert(MO.getTargetFlags() == 0 && + "WebAssembly does not use target flags on MCSymbol"); + MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false); + break; } OutMI.addOperand(MCOp); diff --git a/test/CodeGen/WebAssembly/eh-lsda.ll b/test/CodeGen/WebAssembly/eh-lsda.ll new file mode 100644 index 00000000000..fd550938c42 --- /dev/null +++ b/test/CodeGen/WebAssembly/eh-lsda.ll @@ -0,0 +1,239 @@ +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +@_ZTIi = external constant i8* +@_ZTIf = external constant i8* +@_ZTId = external constant i8* + +; Single catch (...) does not need an exception table. +; +; try { +; may_throw(); +; } catch (...) { +; } +; CHECK-LABEL: test0: +; CHECK-NOT: GCC_except_table +define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { +entry: + invoke void @may_throw() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null] + %2 = call i8* @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ] + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +try.cont: ; preds = %entry, %catch.start + ret void +} + +; Exception table generation + shared action test. +; +; try { +; may_throw(); +; } catch (int) { +; } catch (float) { +; } catch (double) { +; } catch (...) { +; } +; +; try { +; may_throw(); +; } catch (double) { +; } catch (...) { +; } +; +; try { +; may_throw(); +; } catch (int) { +; } catch (float) { +; } +; +; There are three landing pads. 
The second landing pad should share action table +; entries with the first landing pad because they end with the same sequence +; (double -> ...). But the third landing table cannot share action table entries +; with others, so it should create its own entries. +; CHECK-LABEL: test1: +; CHECK: .section .rodata.gcc_except_table,"",@ +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: GCC_except_table[[START:[0-9]+]]: +; CHECK-NEXT: .Lexception0: +; CHECK-NEXT: .int8 255 # @LPStart Encoding = omit +; CHECK-NEXT: .int8 0 # @TType Encoding = absptr +; CHECK-NEXT: .uleb128 .Lttbase0-.Lttbaseref0 +; CHECK-NEXT: .Lttbaseref0: +; CHECK-NEXT: .int8 1 # Call site Encoding = uleb128 +; CHECK-NEXT: .uleb128 .Lcst_end0-.Lcst_begin0 +; CHECK-NEXT: .Lcst_begin0: +; CHECK-NEXT: .int8 0 # >> Call Site 0 << +; CHECK-NEXT: # On exception at call site 0 +; CHECK-NEXT: .int8 7 # Action: 4 +; CHECK-NEXT: .int8 1 # >> Call Site 1 << +; CHECK-NEXT: # On exception at call site 1 +; CHECK-NEXT: .int8 3 # Action: 2 +; CHECK-NEXT: .int8 2 # >> Call Site 2 << +; CHECK-NEXT: # On exception at call site 2 +; CHECK-NEXT: .int8 11 # Action: 6 +; CHECK-NEXT: .Lcst_end0: +; CHECK-NEXT: .int8 1 # >> Action Record 1 << +; CHECK-NEXT: # Catch TypeInfo 1 +; CHECK-NEXT: .int8 0 # No further actions +; CHECK-NEXT: .int8 2 # >> Action Record 2 << +; CHECK-NEXT: # Catch TypeInfo 2 +; CHECK-NEXT: .int8 125 # Continue to action 1 +; CHECK-NEXT: .int8 3 # >> Action Record 3 << +; CHECK-NEXT: # Catch TypeInfo 3 +; CHECK-NEXT: .int8 125 # Continue to action 2 +; CHECK-NEXT: .int8 4 # >> Action Record 4 << +; CHECK-NEXT: # Catch TypeInfo 4 +; CHECK-NEXT: .int8 125 # Continue to action 3 +; CHECK-NEXT: .int8 3 # >> Action Record 5 << +; CHECK-NEXT: # Catch TypeInfo 3 +; CHECK-NEXT: .int8 0 # No further actions +; CHECK-NEXT: .int8 4 # >> Action Record 6 << +; CHECK-NEXT: # Catch TypeInfo 4 +; CHECK-NEXT: .int8 125 # Continue to action 5 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: # >> Catch TypeInfos << +; CHECK-NEXT: .int32 
_ZTIi # TypeInfo 4 +; CHECK-NEXT: .int32 _ZTIf # TypeInfo 3 +; CHECK-NEXT: .int32 _ZTId # TypeInfo 2 +; CHECK-NEXT: .int32 0 # TypeInfo 1 +; CHECK-NEXT: .Lttbase0: +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LGCC_except_table_end[[END:[0-9]+]]: +; CHECK-NEXT: .size GCC_except_table[[START]], .LGCC_except_table_end[[END]]-GCC_except_table[[START]] +define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) { +entry: + invoke void @may_throw() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*), i8* bitcast (i8** @_ZTId to i8*), i8* null] + %2 = call i8* @llvm.wasm.get.exception(token %1) + %3 = call i32 @llvm.wasm.get.ehselector(token %1) + %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch10, label %catch.fallthrough + +catch10: ; preds = %catch.start + %5 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ] + %6 = bitcast i8* %5 to i32* + %7 = load i32, i32* %6, align 4 + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +try.cont: ; preds = %entry, %catch, %catch4, %catch7, %catch10 + invoke void @may_throw() + to label %try.cont23 unwind label %catch.dispatch14 + +catch.dispatch14: ; preds = %try.cont + %8 = catchswitch within none [label %catch.start15] unwind to caller + +catch.start15: ; preds = %catch.dispatch14 + %9 = catchpad within %8 [i8* bitcast (i8** @_ZTId to i8*), i8* null] + %10 = call i8* @llvm.wasm.get.exception(token %9) + %11 = call i32 @llvm.wasm.get.ehselector(token %9) + %12 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*)) + %matches16 = icmp eq i32 %11, %12 + %13 = call i8* @__cxa_begin_catch(i8* %10) [ "funclet"(token %9) ] + br 
i1 %matches16, label %catch20, label %catch17 + +catch20: ; preds = %catch.start15 + %14 = bitcast i8* %13 to double* + %15 = load double, double* %14, align 8 + call void @__cxa_end_catch() [ "funclet"(token %9) ] + catchret from %9 to label %try.cont23 + +try.cont23: ; preds = %try.cont, %catch17, %catch20 + invoke void @may_throw() + to label %try.cont36 unwind label %catch.dispatch25 + +catch.dispatch25: ; preds = %try.cont23 + %16 = catchswitch within none [label %catch.start26] unwind to caller + +catch.start26: ; preds = %catch.dispatch25 + %17 = catchpad within %16 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*)] + %18 = call i8* @llvm.wasm.get.exception(token %17) + %19 = call i32 @llvm.wasm.get.ehselector(token %17) + %20 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) + %matches27 = icmp eq i32 %19, %20 + br i1 %matches27, label %catch33, label %catch.fallthrough28 + +catch33: ; preds = %catch.start26 + %21 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ] + %22 = bitcast i8* %21 to i32* + %23 = load i32, i32* %22, align 4 + call void @__cxa_end_catch() [ "funclet"(token %17) ] + catchret from %17 to label %try.cont36 + +catch.fallthrough28: ; preds = %catch.start26 + %24 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*)) + %matches29 = icmp eq i32 %19, %24 + br i1 %matches29, label %catch30, label %rethrow + +catch30: ; preds = %catch.fallthrough28 + %25 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ] + %26 = bitcast i8* %25 to float* + %27 = load float, float* %26, align 4 + call void @__cxa_end_catch() [ "funclet"(token %17) ] + catchret from %17 to label %try.cont36 + +rethrow: ; preds = %catch.fallthrough28 + call void @__cxa_rethrow() [ "funclet"(token %17) ] + unreachable + +try.cont36: ; preds = %try.cont23, %catch30, %catch33 + ret void + +catch17: ; preds = %catch.start15 + call void @__cxa_end_catch() [ "funclet"(token %9) ] + catchret from %9 to label %try.cont23 
+ +catch.fallthrough: ; preds = %catch.start + %28 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*)) + %matches1 = icmp eq i32 %3, %28 + br i1 %matches1, label %catch7, label %catch.fallthrough2 + +catch7: ; preds = %catch.fallthrough + %29 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ] + %30 = bitcast i8* %29 to float* + %31 = load float, float* %30, align 4 + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +catch.fallthrough2: ; preds = %catch.fallthrough + %32 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*)) + %matches3 = icmp eq i32 %3, %32 + %33 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ] + br i1 %matches3, label %catch4, label %catch + +catch4: ; preds = %catch.fallthrough2 + %34 = bitcast i8* %33 to double* + %35 = load double, double* %34, align 8 + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +catch: ; preds = %catch.fallthrough2 + call void @__cxa_end_catch() [ "funclet"(token %1) ] + catchret from %1 to label %try.cont +} + +declare void @may_throw() +declare i32 @llvm.eh.typeid.for(i8*) +declare i8* @llvm.wasm.get.exception(token) +declare i32 @llvm.wasm.get.ehselector(token) +declare void @__cxa_rethrow() +declare i8* @__cxa_begin_catch(i8*) +declare void @__cxa_end_catch() +declare i32 @__gxx_wasm_personality_v0(...) 
diff --git a/test/CodeGen/WebAssembly/wasmehprepare.ll b/test/CodeGen/WebAssembly/wasmehprepare.ll index e6005e34057..67e198eb058 100644 --- a/test/CodeGen/WebAssembly/wasmehprepare.ll +++ b/test/CodeGen/WebAssembly/wasmehprepare.ll @@ -30,7 +30,7 @@ catch.start: ; preds = %catch.dispatch ; CHECK: catch.start: ; CHECK-NEXT: %[[CATCHPAD:.*]] = catchpad ; CHECK-NEXT: %[[EXN:.*]] = call i8* @llvm.wasm.catch(i32 0) -; CHECK-NEXT: call void @llvm.wasm.landingpad.index(i32 0) +; CHECK-NEXT: call void @llvm.wasm.landingpad.index(token %[[CATCHPAD]], i32 0) ; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 0) ; CHECK-NEXT: %[[LSDA:.*]] = call i8* @llvm.wasm.lsda() ; CHECK-NEXT: store i8* %[[LSDA]], i8** getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 1) @@ -98,7 +98,7 @@ catch.start3: ; preds = %catch.dispatch2 %matches = icmp eq i32 %8, %9 br i1 %matches, label %catch4, label %rethrow ; CHECK: catch.start3: -; CHECK: call void @llvm.wasm.landingpad.index(i32 0) +; CHECK: call void @llvm.wasm.landingpad.index(token %{{.+}}, i32 0) catch4: ; preds = %catch.start3 %10 = call i8* @__cxa_begin_catch(i8* %7) [ "funclet"(token %6) ] @@ -311,7 +311,7 @@ declare void @__cxa_rethrow() declare void @__clang_call_terminate(i8*) ; CHECK-DAG: declare i8* @llvm.wasm.catch(i32) -; CHECK-DAG: declare void @llvm.wasm.landingpad.index(i32) +; CHECK-DAG: declare void @llvm.wasm.landingpad.index(token, i32) ; CHECK-DAG: declare i8* @llvm.wasm.lsda() ; CHECK-DAG: declare i32 @_Unwind_CallPersonality(i8*) -- GitLab From 341f13c81dcaa21a6beab540e71e0bc15c526e66 Mon Sep 17 00:00:00 2001 From: Sebastian Pop Date: Tue, 16 Oct 2018 00:42:07 +0000 Subject: [PATCH 0231/1116] [hot-cold-split] fix failing testcases git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344577 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/HotColdSplit/split-cold-2.ll | 2 +- 
.../HotColdSplit/split-out-dbg-val-of-arg.ll | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll index e243a47623a..3e1a567113a 100644 --- a/test/Transforms/HotColdSplit/split-cold-2.ll +++ b/test/Transforms/HotColdSplit/split-cold-2.ll @@ -13,7 +13,7 @@ entry: br i1 undef, label %if.then, label %if.else if.then: - unreachable + ret void if.else: br label %if.then4 diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll index 4b81de7b35b..dcaff122442 100644 --- a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll +++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll @@ -9,7 +9,7 @@ entry: br i1 undef, label %if.then, label %if.end, !dbg !12 if.then: ; preds = %entry - unreachable, !dbg !13 + ret void, !dbg !13 if.end: ; preds = %entry call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11 @@ -19,10 +19,16 @@ if.then12: ; preds = %if.end br label %cleanup40, !dbg !15 cleanup40: ; preds = %if.then12 - br label %return, !dbg !16 + br i1 undef, label %if.then5, label %if.end1, !dbg !16 + +if.then5: + br label %return, !dbg !17 + +if.end1: + br label %return, !dbg !18 return: ; preds = %cleanup40 - ret void, !dbg !17 + unreachable, !dbg !19 } declare void @llvm.dbg.value(metadata, metadata, metadata) @@ -49,3 +55,5 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) !15 = !DILocation(line: 5, column: 1, scope: !6) !16 = !DILocation(line: 6, column: 1, scope: !6) !17 = !DILocation(line: 7, column: 1, scope: !6) +!18 = !DILocation(line: 8, column: 1, scope: !6) +!19 = !DILocation(line: 9, column: 1, scope: !6) -- GitLab From f2cb5da6a45f63427c0d1e6a3f0deca57c44429e Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 05:26:21 +0000 Subject: [PATCH 0232/1116] [SCEV] Limit AddRec "simplifications" to 
avoid combinatorial explosions SCEV's transform that turns `{A1,+,A2,+,...,+,An} * {B1,+,B2,+,...,+,Bn}` into a single AddRec of size `2n+1` with complex combinatorial coefficients can easily trigger exponential growth of the SCEV (in case if nothing gets folded and simplified). We tried to restrain this transform using the option `scalar-evolution-max-add-rec-size`, but its default value seems to be insufficiently small: the test attached to this patch with default value of this option `16` has a SCEV of >3M symbols (when printed out). This patch reduces the simplification limit. It is not a cure to combinatorial explosions, but at least it reduces this corner case to something more or less reasonable. Differential Revision: https://reviews.llvm.org/D53282 Reviewed By: sanjoy git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344584 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Analysis/ScalarEvolution.cpp | 2 +- .../ScalarEvolution/binomial-explision.ll | 47 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 test/Analysis/ScalarEvolution/binomial-explision.ll diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 4a30447f647..60cd1cb4127 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -204,7 +204,7 @@ static cl::opt static cl::opt MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden, cl::desc("Max coefficients in AddRec during evolving"), - cl::init(16)); + cl::init(8)); //===----------------------------------------------------------------------===// // SCEV class definitions diff --git a/test/Analysis/ScalarEvolution/binomial-explision.ll b/test/Analysis/ScalarEvolution/binomial-explision.ll new file mode 100644 index 00000000000..82d0beda6b5 --- /dev/null +++ b/test/Analysis/ScalarEvolution/binomial-explision.ll @@ -0,0 +1,47 @@ +; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s + +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" + +; Check that we don't have unreasonably huge SCEVs and in particular only a +; reasonable amount of AddRecs in the notation of %tmp19. If we "simplify" SCEVs +; too aggressively, we may end up with huge nested expressions. +define void @test(i32 %x, i64 %y, i1 %cond) { + +; CHECK: %tmp19 = mul i32 %tmp17, %tmp18 +; CHECK: (((( +; CHECK-NOT: ((((( +; CHECK: %tmp20 = add i32 %tmp19, %x + +bb: + br label %bb1 + +bb1: ; preds = %bb3, %bb + %tmp = phi i64 [ %y, %bb ], [ %tmp22, %bb3 ] + %tmp2 = phi i32 [ %x, %bb ], [ %tmp4, %bb3 ] + br label %bb5 + +bb3: ; preds = %bb5 + %tmp4 = add i32 %tmp2, %x + br label %bb1 + +bb5: ; preds = %bb5, %bb1 + %tmp6 = phi i32 [ %tmp23, %bb5 ], [ %tmp2, %bb1 ] + %tmp7 = sub i32 -119, %tmp6 + %tmp8 = mul i32 %tmp7, %x + %tmp9 = sub i32 -120, %tmp6 + %tmp10 = mul i32 %tmp8, %tmp9 + %tmp11 = mul i32 %x, %tmp10 + %tmp12 = sub i32 -121, %tmp6 + %tmp13 = mul i32 %tmp10, %tmp12 + %tmp14 = mul i32 %tmp11, %tmp13 + %tmp15 = sub i32 -122, %tmp6 + %tmp16 = mul i32 %tmp13, %tmp15 + %tmp17 = mul i32 %tmp14, %tmp16 + %tmp18 = mul i32 %tmp16, %x + %tmp19 = mul i32 %tmp17, %tmp18 + %tmp20 = add i32 %tmp19, %x + %tmp21 = sext i32 %tmp20 to i64 + %tmp22 = add i64 %y, %tmp21 + %tmp23 = add i32 %tmp6, 7 + br i1 %cond, label %bb5, label %bb3 +} -- GitLab From d043791034148b50b5479124613a622f2b17cb7e Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Tue, 16 Oct 2018 05:40:18 +0000 Subject: [PATCH 0233/1116] [llvm-objcopy] Factor out Buffer In this diff we move out the hierarchy of buffers from Object.h/Object.cpp into separate files since it is not ELF-specific and will be reused later. After this change Object.h/Object.cpp are almost exclusively ELF-specific. 
Test plan: make check-all Differential revision: https://reviews.llvm.org/D53298 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344585 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-objcopy/Buffer.cpp | 51 ++++++++++++++++++++++ tools/llvm-objcopy/Buffer.h | 66 +++++++++++++++++++++++++++++ tools/llvm-objcopy/CMakeLists.txt | 1 + tools/llvm-objcopy/Object.cpp | 31 -------------- tools/llvm-objcopy/Object.h | 44 +------------------ tools/llvm-objcopy/llvm-objcopy.cpp | 1 + 6 files changed, 120 insertions(+), 74 deletions(-) create mode 100644 tools/llvm-objcopy/Buffer.cpp create mode 100644 tools/llvm-objcopy/Buffer.h diff --git a/tools/llvm-objcopy/Buffer.cpp b/tools/llvm-objcopy/Buffer.cpp new file mode 100644 index 00000000000..8044b023aaa --- /dev/null +++ b/tools/llvm-objcopy/Buffer.cpp @@ -0,0 +1,51 @@ +//===- Buffer.cpp ---------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "Buffer.h" +#include "llvm-objcopy.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +namespace llvm { +namespace objcopy { + +Buffer::~Buffer() {} + +void FileBuffer::allocate(size_t Size) { + Expected> BufferOrErr = + FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable); + handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) { + error("failed to open " + getName() + ": " + E.message()); + }); + Buf = std::move(*BufferOrErr); +} + +Error FileBuffer::commit() { return Buf->commit(); } + +uint8_t *FileBuffer::getBufferStart() { + return reinterpret_cast(Buf->getBufferStart()); +} + +void MemBuffer::allocate(size_t Size) { + Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName()); +} + +Error MemBuffer::commit() { return Error::success(); } + +uint8_t *MemBuffer::getBufferStart() { + return reinterpret_cast(Buf->getBufferStart()); +} + +std::unique_ptr MemBuffer::releaseMemoryBuffer() { + return std::move(Buf); +} + +} // end namespace objcopy +} // end namespace llvm diff --git a/tools/llvm-objcopy/Buffer.h b/tools/llvm-objcopy/Buffer.h new file mode 100644 index 00000000000..e5b9c5b2d22 --- /dev/null +++ b/tools/llvm-objcopy/Buffer.h @@ -0,0 +1,66 @@ +//===- Buffer.h -------------------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_OBJCOPY_BUFFER_H +#define LLVM_TOOLS_OBJCOPY_BUFFER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/MemoryBuffer.h" +#include + +namespace llvm { +namespace objcopy { + +// The class Buffer abstracts out the common interface of FileOutputBuffer and +// WritableMemoryBuffer so that the hierarchy of Writers depends on this +// abstract interface and doesn't depend on a particular implementation. +// TODO: refactor the buffer classes in LLVM to enable us to use them here +// directly. +class Buffer { + StringRef Name; + +public: + virtual ~Buffer(); + virtual void allocate(size_t Size) = 0; + virtual uint8_t *getBufferStart() = 0; + virtual Error commit() = 0; + + explicit Buffer(StringRef Name) : Name(Name) {} + StringRef getName() const { return Name; } +}; + +class FileBuffer : public Buffer { + std::unique_ptr Buf; + +public: + void allocate(size_t Size) override; + uint8_t *getBufferStart() override; + Error commit() override; + + explicit FileBuffer(StringRef FileName) : Buffer(FileName) {} +}; + +class MemBuffer : public Buffer { + std::unique_ptr Buf; + +public: + void allocate(size_t Size) override; + uint8_t *getBufferStart() override; + Error commit() override; + + explicit MemBuffer(StringRef Name) : Buffer(Name) {} + + std::unique_ptr releaseMemoryBuffer(); +}; + +} // end namespace objcopy +} // end namespace llvm + +#endif // LLVM_TOOLS_OBJCOPY_BUFFER_H diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt index 8d963e56758..9ac7d0eb4c2 100644 --- a/tools/llvm-objcopy/CMakeLists.txt +++ b/tools/llvm-objcopy/CMakeLists.txt @@ -14,6 +14,7 @@ tablegen(LLVM StripOpts.inc -gen-opt-parser-defs) add_public_tablegen_target(StripOptsTableGen) add_llvm_tool(llvm-objcopy + Buffer.cpp CopyConfig.cpp llvm-objcopy.cpp Object.cpp diff --git a/tools/llvm-objcopy/Object.cpp 
b/tools/llvm-objcopy/Object.cpp index ddf811a769b..d677579ea23 100644 --- a/tools/llvm-objcopy/Object.cpp +++ b/tools/llvm-objcopy/Object.cpp @@ -33,37 +33,6 @@ using namespace llvm::objcopy; using namespace object; using namespace ELF; -Buffer::~Buffer() {} - -void FileBuffer::allocate(size_t Size) { - Expected> BufferOrErr = - FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable); - handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) { - error("failed to open " + getName() + ": " + E.message()); - }); - Buf = std::move(*BufferOrErr); -} - -Error FileBuffer::commit() { return Buf->commit(); } - -uint8_t *FileBuffer::getBufferStart() { - return reinterpret_cast(Buf->getBufferStart()); -} - -void MemBuffer::allocate(size_t Size) { - Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName()); -} - -Error MemBuffer::commit() { return Error::success(); } - -uint8_t *MemBuffer::getBufferStart() { - return reinterpret_cast(Buf->getBufferStart()); -} - -std::unique_ptr MemBuffer::releaseMemoryBuffer() { - return std::move(Buf); -} - template void ELFWriter::writePhdr(const Segment &Seg) { uint8_t *B = Buf.getBufferStart(); B += Obj.ProgramHdrSegment.Offset + Seg.Index * sizeof(Elf_Phdr); diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/Object.h index 46c8f1ca4bf..1019391fa89 100644 --- a/tools/llvm-objcopy/Object.h +++ b/tools/llvm-objcopy/Object.h @@ -10,6 +10,7 @@ #ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H #define LLVM_TOOLS_OBJCOPY_OBJECT_H +#include "Buffer.h" #include "CopyConfig.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" @@ -30,7 +31,6 @@ namespace llvm { enum class DebugCompressionType; namespace objcopy { -class Buffer; class SectionBase; class Section; class OwnedDataSection; @@ -146,48 +146,6 @@ public: explicit BinarySectionWriter(Buffer &Buf) : SectionWriter(Buf) {} }; -// The class Buffer abstracts out the common interface of FileOutputBuffer and -// WritableMemoryBuffer so that the 
hierarchy of Writers depends on this -// abstract interface and doesn't depend on a particular implementation. -// TODO: refactor the buffer classes in LLVM to enable us to use them here -// directly. -class Buffer { - StringRef Name; - -public: - virtual ~Buffer(); - virtual void allocate(size_t Size) = 0; - virtual uint8_t *getBufferStart() = 0; - virtual Error commit() = 0; - - explicit Buffer(StringRef Name) : Name(Name) {} - StringRef getName() const { return Name; } -}; - -class FileBuffer : public Buffer { - std::unique_ptr Buf; - -public: - void allocate(size_t Size) override; - uint8_t *getBufferStart() override; - Error commit() override; - - explicit FileBuffer(StringRef FileName) : Buffer(FileName) {} -}; - -class MemBuffer : public Buffer { - std::unique_ptr Buf; - -public: - void allocate(size_t Size) override; - uint8_t *getBufferStart() override; - Error commit() override; - - explicit MemBuffer(StringRef Name) : Buffer(Name) {} - - std::unique_ptr releaseMemoryBuffer(); -}; - class Writer { protected: Object &Obj; diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp index c9b170d1d61..b7dbf6c66b3 100644 --- a/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/tools/llvm-objcopy/llvm-objcopy.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm-objcopy.h" +#include "Buffer.h" #include "CopyConfig.h" #include "Object.h" -- GitLab From 600d43cad2724c404c52e8401ef934f1e1f90e41 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 06:34:53 +0000 Subject: [PATCH 0234/1116] [NFC] Turn isGuaranteedToExecute into a method git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344587 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/MustExecute.h | 11 +++++------ lib/Analysis/MustExecute.cpp | 16 ++++++++-------- lib/Transforms/Scalar/LICM.cpp | 6 +++--- lib/Transforms/Scalar/LoopUnswitch.cpp | 2 +- 4 files changed, 17 insertions(+), 18 
deletions(-) diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index 40a02735d1b..82387476a6d 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -82,15 +82,14 @@ public: /// LoopSafetyInfo. Some callers rely on this fact. void computeLoopSafetyInfo(Loop *); + /// Returns true if the instruction in a loop is guaranteed to execute at + /// least once (under the assumption that the loop is entered). + bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, + const Loop *CurLoop) const; + LoopSafetyInfo() = default; }; -/// Returns true if the instruction in a loop is guaranteed to execute at least -/// once (under the assumption that the loop is entered). -bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo); - } #endif diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index 79ec8e400c0..7c1ce86d15b 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -176,9 +176,9 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop, /// Returns true if the instruction in a loop is guaranteed to execute at least /// once. -bool llvm::isGuaranteedToExecute(const Instruction &Inst, - const DominatorTree *DT, const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo) { +bool LoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop) const { // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -191,17 +191,17 @@ bool llvm::isGuaranteedToExecute(const Instruction &Inst, // Inst unless we can prove that Inst comes before the potential implicit // exit. 
At the moment, we use a (cheap) hack for the common case where // the instruction of interest is the first one in the block. - return !SafetyInfo->headerMayThrow() || - Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; + return !headerMayThrow() || + Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; // Somewhere in this loop there is an instruction which may throw and make us // exit the loop. - if (SafetyInfo->anyBlockMayThrow()) + if (anyBlockMayThrow()) return false; // If there is a path from header to exit or latch that doesn't lead to our // instruction's block, return false. - if (!SafetyInfo->allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT)) + if (!allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT)) return false; return true; @@ -242,7 +242,7 @@ static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) { // caller actually gets the full power at the moment. LoopSafetyInfo LSI; LSI.computeLoopSafetyInfo(L); - return isGuaranteedToExecute(I, DT, L, &LSI) || + return LSI.isGuaranteedToExecute(I, DT, L) || isGuaranteedToExecuteForEveryIteration(&I, L); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 601d49fc03f..9bf75a4ffbf 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -1116,7 +1116,7 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning // time in isGuaranteedToExecute if we don't actually have anything to // drop. It is a compile time optimization, not required for correctness. - !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo)) + !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) I.dropUnknownNonDebugMetadata(); // Move the new node to the Preheader, before its terminator. 
@@ -1150,7 +1150,7 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst, return true; bool GuaranteedToExecute = - isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo); + SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop); if (!GuaranteedToExecute) { auto *LI = dyn_cast(&Inst); @@ -1408,7 +1408,7 @@ bool llvm::promoteLoopAccessesToScalars( if (!DereferenceableInPH || !SafeToInsertStore || (InstAlignment > Alignment)) { - if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) { + if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) { DereferenceableInPH = true; SafeToInsertStore = true; Alignment = std::max(Alignment, InstAlignment); diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 13e6bd13754..cd49f51283f 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -721,7 +721,7 @@ bool LoopUnswitch::processCurrentLoop() { // This is a workaround for the discrepancy between LLVM IR and MSan // semantics. See PR28054 for more details. if (SanitizeMemory && - !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo)) + !SafetyInfo.isGuaranteedToExecute(*TI, DT, currentLoop)) continue; if (BranchInst *BI = dyn_cast(TI)) { -- GitLab From 2173a4b23e50db6a3df8ddc7c17bf73cd39a1828 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 07:50:14 +0000 Subject: [PATCH 0235/1116] [NFC] Move block throw check inside allLoopPathsLeadToBlock git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344588 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/MustExecute.h | 7 +++++-- lib/Analysis/MustExecute.cpp | 16 ++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index 82387476a6d..e643e4ec563 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -65,13 +65,16 @@ public: /// abnormally. 
bool headerMayThrow() const; + /// Returns true iff the block \p BB potentially may throw exception. It can + /// be false-positive in cases when we want to avoid complex analysis. + bool blockMayThrow(const BasicBlock *BB) const; + /// Returns true iff any block of the loop for which this info is contains an /// instruction that may throw or otherwise exit abnormally. bool anyBlockMayThrow() const; /// Return true if we must reach the block \p BB under assumption that the - /// loop \p CurLoop is entered and no instruction throws or otherwise exits - /// abnormally. + /// loop \p CurLoop is entered. bool allLoopPathsLeadToBlock(const Loop *CurLoop, const BasicBlock *BB, const DominatorTree *DT) const; diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index 7c1ce86d15b..7f0912de26b 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -26,6 +26,11 @@ bool LoopSafetyInfo::headerMayThrow() const { return HeaderMayThrow; } +bool LoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const { + (void)BB; + return anyBlockMayThrow(); +} + bool LoopSafetyInfo::anyBlockMayThrow() const { return MayThrow; } @@ -148,7 +153,10 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop, // 3) Exit blocks which are not taken on 1st iteration. // Memoize blocks we've already checked. SmallPtrSet CheckedSuccessors; - for (auto *Pred : Predecessors) + for (auto *Pred : Predecessors) { + // Predecessor block may throw, so it has a side exit. + if (blockMayThrow(Pred)) + return false; for (auto *Succ : successors(Pred)) if (CheckedSuccessors.insert(Succ).second && Succ != BB && !Predecessors.count(Succ)) @@ -169,6 +177,7 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop, if (CurLoop->contains(Succ) || !CanProveNotTakenFirstIteration(Succ, DT, CurLoop)) return false; + } // All predecessors can only lead us to BB. 
return true; @@ -194,11 +203,6 @@ bool LoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, return !headerMayThrow() || Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; - // Somewhere in this loop there is an instruction which may throw and make us - // exit the loop. - if (anyBlockMayThrow()) - return false; - // If there is a path from header to exit or latch that doesn't lead to our // instruction's block, return false. if (!allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT)) -- GitLab From 72b430c741dde011dbb9e5445f4cfffa807b3f2a Mon Sep 17 00:00:00 2001 From: David Stenberg Date: Tue, 16 Oct 2018 08:06:48 +0000 Subject: [PATCH 0236/1116] [DebugInfo][LCSSA] Rewrite pre-existing debug values outside loop Summary: Extend LCSSA so that debug values outside loops are rewritten to use the PHI nodes that the pass creates. This fixes PR39019. In that case, we ran LCSSA on a loop that was later on vectorized, which left us with something like this: for.cond.cleanup: %add.lcssa = phi i32 [ %add, %for.body ], [ %34, %middle.block ] call void @llvm.dbg.value(metadata i32 %add, ret i32 %add.lcssa for.body: %add = [...] br i1 %exitcond, label %for.cond.cleanup, label %for.body which later resulted in the debug.value becoming undef when removing the scalar loop (and the location would have probably been wrong for the vectorized case otherwise). As we now may need to query the AvailableVals cache more than once for a basic block, FindAvailableVals() in SSAUpdaterImpl is changed so that it updates the cache for blocks that we do not create a PHI node for, regardless of the block's number of predecessors. The debug value in the attached IR reproducer would not be properly rewritten without this. Debug values residing in blocks where we have not inserted any PHI nodes are currently left as-is by this patch. I'm not sure what should be done with those uses. 
Reviewers: mattd, aprantl, vsk, probinson Reviewed By: mattd, aprantl Subscribers: jmorse, gbedwell, JDevlieghere, llvm-commits Differential Revision: https://reviews.llvm.org/D53130 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344589 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Utils/SSAUpdater.h | 4 ++ .../llvm/Transforms/Utils/SSAUpdaterImpl.h | 7 +- lib/Transforms/Utils/LCSSA.cpp | 16 +++++ lib/Transforms/Utils/SSAUpdater.cpp | 5 ++ .../LCSSA/rewrite-existing-dbg-values.ll | 69 +++++++++++++++++++ 5 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 test/Transforms/LCSSA/rewrite-existing-dbg-values.ll diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h index 4a791166299..d02607acbbb 100644 --- a/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/include/llvm/Transforms/Utils/SSAUpdater.h @@ -76,6 +76,10 @@ public: /// block. bool HasValueForBlock(BasicBlock *BB) const; + /// Return the value for the specified block if the SSAUpdater has one, + /// otherwise return nullptr. + Value *FindValueForBlock(BasicBlock *BB) const; + /// Construct SSA form, materializing a value that is live at the end /// of the specified block. Value *GetValueAtEndOfBlock(BasicBlock *BB); diff --git a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index b7649ba8833..cab0f3e7157 100644 --- a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -357,10 +357,9 @@ public: BBInfo *Info = *I; if (Info->DefBB != Info) { - // Record the available value at join nodes to speed up subsequent - // uses of this SSAUpdater for the same value. - if (Info->NumPreds > 1) - (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal; + // Record the available value to speed up subsequent uses of this + // SSAUpdater for the same value. 
+ (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal; continue; } diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp index a1f8e7484bc..53d444b309d 100644 --- a/lib/Transforms/Utils/LCSSA.cpp +++ b/lib/Transforms/Utils/LCSSA.cpp @@ -41,6 +41,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" @@ -201,6 +202,21 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl &Worklist, SSAUpdate.RewriteUse(*UseToRewrite); } + SmallVector DbgValues; + llvm::findDbgValues(DbgValues, I); + + // Update pre-existing debug value uses that reside outside the loop. + auto &Ctx = I->getContext(); + for (auto DVI : DbgValues) { + BasicBlock *UserBB = DVI->getParent(); + if (InstBB == UserBB || L->contains(UserBB)) + continue; + // We currently only handle debug values residing in blocks where we have + // inserted a PHI instruction. + if (Value *V = SSAUpdate.FindValueForBlock(UserBB)) + DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V))); + } + // SSAUpdater might have inserted phi-nodes inside other loops. We'll need // to post-process them to keep LCSSA form. for (PHINode *InsertedPN : InsertedPHIs) { diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index 4a1fd8d571a..9e5fb0e7172 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -64,6 +64,11 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const { return getAvailableVals(AV).count(BB); } +Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const { + AvailableValsTy::iterator AVI = getAvailableVals(AV).find(BB); + return (AVI != getAvailableVals(AV).end()) ? 
AVI->second : nullptr; +} + void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) { assert(ProtoType && "Need to initialize SSAUpdater"); assert(ProtoType == V->getType() && diff --git a/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll new file mode 100644 index 00000000000..563a75f407f --- /dev/null +++ b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -lcssa < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Reproducer for PR39019. +; +; Verify that the llvm.dbg.value in the %for.cond.cleanup2 block is rewritten +; to use the PHI node for %add that is created by LCSSA. + +; CHECK-LABEL: for.cond.cleanup2: +; CHECK-NEXT: [[PN:%[^ ]*]] = phi i32 [ %add.lcssa, %for.cond.cleanup1 ] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN]], metadata [[VAR:![0-9]+]], metadata !DIExpression()) +; CHECK-NEXT: call void @bar(i32 [[PN]]) + +; CHECK-LABEL: for.body: +; CHECK: %add = add nsw i32 0, 2 +; CHECK: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR]], metadata !DIExpression()) + +; CHECK: [[VAR]] = !DILocalVariable(name: "sum", + +; Function Attrs: nounwind +define void @foo() #0 !dbg !6 { +entry: + br label %for.cond.preheader, !dbg !12 + +for.cond.preheader: ; preds = %for.cond.cleanup1, %entry + br label %for.body, !dbg !12 + +for.cond.cleanup2: ; preds = %for.cond.cleanup1 + call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12 + tail call void @bar(i32 %add) #0, !dbg !12 + ret void, !dbg !12 + +for.cond.cleanup1: ; preds = %for.body + br i1 false, label %for.cond.preheader, label %for.cond.cleanup2, !dbg !12 + +for.body: ; preds = %for.body, %for.cond.preheader + %add = add nsw i32 0, 2, !dbg !12 + call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12 + br i1 false, label %for.body, 
label %for.cond.cleanup1, !dbg !12 +} + +; Function Attrs: nounwind +declare void @bar(i32) #0 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone speculatable } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, nameTableKind: None) +!1 = !DIFile(filename: "foo.c", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!"clang version 8.0.0"} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 10, type: !7, isLocal: false, isDefinition: true, scopeLine: 10, isOptimized: true, unit: !0, retainedNodes: !8) +!7 = !DISubroutineType(types: !2) +!8 = !{!9} +!9 = !DILocalVariable(name: "sum", scope: !10, file: !1, line: 11, type: !11) +!10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !DILocation(line: 0, scope: !10) -- GitLab From d871042d6011d26a98608ce19cd90a2dc6794301 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 08:07:14 +0000 Subject: [PATCH 0237/1116] [NFC] Encapsulate work with BlockColors in LoopSafetyInfo git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344590 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/MustExecute.h | 16 +++++++++++++--- lib/Analysis/MustExecute.cpp | 17 ++++++++++++++++- lib/Transforms/Scalar/LICM.cpp | 15 ++++++--------- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index e643e4ec563..d78b38bdead 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -49,6 
+49,9 @@ class LoopSafetyInfo { // may throw. bool HeaderMayThrow = false; // Same as previous, but specific to loop header + // Used to update funclet bundle operands. + DenseMap BlockColors; + /// Collect all blocks from \p CurLoop which lie on all possible paths from /// the header of \p CurLoop (inclusive) to BB (exclusive) into the set /// \p Predecessors. If \p BB is the header, \p Predecessors will be empty. @@ -56,9 +59,16 @@ class LoopSafetyInfo { const Loop *CurLoop, const BasicBlock *BB, SmallPtrSetImpl &Predecessors) const; +protected: + /// Computes block colors. + void computeBlockColors(const Loop *CurLoop); + public: - // Used to update funclet bundle operands. - DenseMap BlockColors; + /// Returns block colors map that is used to update funclet operand bundles. + const DenseMap &getBlockColors() const; + + /// Copy colors of block \p Old into the block \p New. + void copyColors(BasicBlock *New, BasicBlock *Old); /// Returns true iff the header block of the loop for which this info is /// calculated contains an instruction that may throw or otherwise exit @@ -83,7 +93,7 @@ public: /// as argument. Updates safety information in LoopSafetyInfo argument. /// Note: This is defined to clear and reinitialize an already initialized /// LoopSafetyInfo. Some callers rely on this fact. - void computeLoopSafetyInfo(Loop *); + void computeLoopSafetyInfo(const Loop *CurLoop); /// Returns true if the instruction in a loop is guaranteed to execute at /// least once (under the assumption that the loop is entered). 
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index 7f0912de26b..bce941be26c 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -22,6 +22,17 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; +const DenseMap & +LoopSafetyInfo::getBlockColors() const { + return BlockColors; +} + +void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) { + ColorVector &ColorsForNewBlock = BlockColors[New]; + ColorVector &ColorsForOldBlock = BlockColors[Old]; + ColorsForNewBlock = ColorsForOldBlock; +} + bool LoopSafetyInfo::headerMayThrow() const { return HeaderMayThrow; } @@ -35,7 +46,7 @@ bool LoopSafetyInfo::anyBlockMayThrow() const { return MayThrow; } -void LoopSafetyInfo::computeLoopSafetyInfo(Loop *CurLoop) { +void LoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) { assert(CurLoop != nullptr && "CurLoop can't be null"); BasicBlock *Header = CurLoop->getHeader(); // Iterate over header and compute safety info. @@ -51,6 +62,10 @@ void LoopSafetyInfo::computeLoopSafetyInfo(Loop *CurLoop) { (BB != BBE) && !MayThrow; ++BB) MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(*BB); + computeBlockColors(CurLoop); +} + +void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) { // Compute funclet colors if we might sink/hoist in a function with a funclet // personality routine. 
Function *Fn = CurLoop->getHeader()->getParent(); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 9bf75a4ffbf..6c899289593 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -798,7 +798,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop, static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, TargetTransformInfo *TTI, bool &FreeInLoop) { - const auto &BlockColors = SafetyInfo->BlockColors; + const auto &BlockColors = SafetyInfo->getBlockColors(); bool IsFree = isFreeInLoop(I, CurLoop, TTI); for (const User *U : I.users()) { const Instruction *UI = cast(U); @@ -833,7 +833,7 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopSafetyInfo *SafetyInfo) { Instruction *New; if (auto *CI = dyn_cast(&I)) { - const auto &BlockColors = SafetyInfo->BlockColors; + const auto &BlockColors = SafetyInfo->getBlockColors(); // Sinking call-sites need to be handled differently from other // instructions. The cloned call-site needs a funclet bundle operand @@ -913,7 +913,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { // it require updating BlockColors for all offspring blocks accordingly. By // skipping such corner case, we can make updating BlockColors after splitting // predecessor fairly simple. 
- if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad()) + if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad()) return false; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *BBPred = *PI; @@ -967,7 +967,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // LE: // %p = phi [%p1, %LE.split], [%p2, %LE.split2] // - auto &BlockColors = SafetyInfo->BlockColors; + const auto &BlockColors = SafetyInfo->getBlockColors(); SmallSetVector PredBBs(pred_begin(ExitBB), pred_end(ExitBB)); while (!PredBBs.empty()) { BasicBlock *PredBB = *PredBBs.begin(); @@ -979,14 +979,11 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT, // Since we do not allow splitting EH-block with BlockColors in // canSplitPredecessors(), we can simply assign predecessor's color to // the new block. - if (!BlockColors.empty()) { + if (!BlockColors.empty()) // Grab a reference to the ColorVector to be inserted before getting the // reference to the vector we are copying because inserting the new // element in BlockColors might cause the map to be reallocated. - ColorVector &ColorsForNewBlock = BlockColors[NewPred]; - ColorVector &ColorsForOldBlock = BlockColors[PredBB]; - ColorsForNewBlock = ColorsForOldBlock; - } + SafetyInfo->copyColors(NewPred, PredBB); } PredBBs.remove(PredBB); } -- GitLab From e3a3e26e8f64091f0d640d63d560219d6e198e74 Mon Sep 17 00:00:00 2001 From: Aleksandar Beserminji Date: Tue, 16 Oct 2018 08:27:28 +0000 Subject: [PATCH 0238/1116] [mips][micromips] Fix how values in .gcc_except_table are calculated When a landing pad is calculated in a program that is compiled for micromips, it will point to an even address. Such an error will cause a segmentation fault, as the instructions in micromips are aligned on odd addresses. 
This patch sets the last bit of the offset where a landing pad is, to 1, which will effectively be an odd address and point to the instruction exactly. Differential Revision: https://reviews.llvm.org/D52985 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344591 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/MC/MCAsmBackend.h | 5 +++ lib/MC/MCExpr.cpp | 5 +++ .../Mips/MCTargetDesc/MipsAsmBackend.cpp | 8 ++++ lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h | 2 + .../Mips/micromips-gcc-except-table.ll | 37 +++++++++++++++++++ 5 files changed, 57 insertions(+) create mode 100644 test/CodeGen/Mips/micromips-gcc-except-table.ll diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h index 030d3c05aa5..07835c21fce 100644 --- a/include/llvm/MC/MCAsmBackend.h +++ b/include/llvm/MC/MCAsmBackend.h @@ -165,6 +165,11 @@ public: return 0; } + /// Check whether a given symbol has been flagged with MICROMIPS flag. + virtual bool isMicroMips(const MCSymbol *Sym) const { + return false; + } + /// Handles all target related code padding when starting to write a new /// basic block to an object file. /// diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index a4458e64bd3..38f311be7c6 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -526,6 +526,11 @@ static void AttemptToFoldSymbolOffsetDifference( if (Asm->isThumbFunc(&SA)) Addend |= 1; + // If symbol is labeled as micromips, we set low-bit to ensure + // correct offset in .gcc_except_table + if (Asm->getBackend().isMicroMips(&SA)) + Addend |= 1; + // Clear the symbol expr pointers to indicate we have folded these // operands. 
A = B = nullptr; diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 4544be9f27f..63f9151da6b 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -569,6 +569,14 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm, } } +bool MipsAsmBackend::isMicroMips(const MCSymbol *Sym) const { + if (const auto *ElfSym = dyn_cast(Sym)) { + if (ElfSym->getOther() & ELF::STO_MIPS_MICROMIPS) + return true; + } + return false; +} + MCAsmBackend *llvm::createMipsAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index 3d5e16fcf9b..30359132e92 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -25,6 +25,7 @@ class MCAssembler; struct MCFixupKindInfo; class MCObjectWriter; class MCRegisterInfo; +class MCSymbolELF; class Target; class MipsAsmBackend : public MCAsmBackend { @@ -90,6 +91,7 @@ public: bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override; + bool isMicroMips(const MCSymbol *Sym) const override; }; // class MipsAsmBackend } // namespace diff --git a/test/CodeGen/Mips/micromips-gcc-except-table.ll b/test/CodeGen/Mips/micromips-gcc-except-table.ll new file mode 100644 index 00000000000..38a76927e2a --- /dev/null +++ b/test/CodeGen/Mips/micromips-gcc-except-table.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips -O3 -filetype=obj < %s | llvm-objdump -s -j .gcc_except_table - | FileCheck %s + +; CHECK: Contents of section .gcc_except_table: +; CHECK-NEXT: 0000 ff9b1501 0c011100 00110e1f 011f1800 +; CHECK-NEXT: 0010 00010000 00000000 + +@_ZTIi = external constant i8* + +define dso_local i32 @main() local_unnamed_addr norecurse personality i8* 
bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind + %0 = bitcast i8* %exception.i to i32* + store i32 5, i32* %0, align 16 + invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn + to label %.noexc unwind label %return + +.noexc: + unreachable + +return: + %1 = landingpad { i8*, i32 } + catch i8* null + %2 = extractvalue { i8*, i32 } %1, 0 + %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind + tail call void @__cxa_end_catch() + ret i32 0 +} + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr + +declare void @__cxa_end_catch() local_unnamed_addr + +declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr -- GitLab From 288477a9492e7bdc4b3650b1fe2b8df092078f86 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 08:31:05 +0000 Subject: [PATCH 0239/1116] [NFC] Make LoopSafetyInfo abstract to allow alternative implementations git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344592 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/MustExecute.h | 46 +++++++++++++++----- lib/Analysis/MustExecute.cpp | 16 +++---- lib/Transforms/Scalar/LICM.cpp | 2 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +- lib/Transforms/Scalar/LoopUnswitch.cpp | 2 +- lib/Transforms/Utils/LoopUnrollAndJam.cpp | 2 +- 6 files changed, 48 insertions(+), 22 deletions(-) diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index d78b38bdead..6a6a127b7c3 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -45,10 +45,6 @@ class Loop; /// loop were made and the info wasn't recomputed properly, the behavior of all /// methods except for computeLoopSafetyInfo is undefined. 
class LoopSafetyInfo { - bool MayThrow = false; // The current loop contains an instruction which - // may throw. - bool HeaderMayThrow = false; // Same as previous, but specific to loop header - // Used to update funclet bundle operands. DenseMap BlockColors; @@ -73,15 +69,15 @@ public: /// Returns true iff the header block of the loop for which this info is /// calculated contains an instruction that may throw or otherwise exit /// abnormally. - bool headerMayThrow() const; + virtual bool headerMayThrow() const = 0; /// Returns true iff the block \p BB potentially may throw exception. It can /// be false-positive in cases when we want to avoid complex analysis. - bool blockMayThrow(const BasicBlock *BB) const; + virtual bool blockMayThrow(const BasicBlock *BB) const = 0; /// Returns true iff any block of the loop for which this info is contains an /// instruction that may throw or otherwise exit abnormally. - bool anyBlockMayThrow() const; + virtual bool anyBlockMayThrow() const = 0; /// Return true if we must reach the block \p BB under assumption that the /// loop \p CurLoop is entered. @@ -93,14 +89,44 @@ public: /// as argument. Updates safety information in LoopSafetyInfo argument. /// Note: This is defined to clear and reinitialize an already initialized /// LoopSafetyInfo. Some callers rely on this fact. - void computeLoopSafetyInfo(const Loop *CurLoop); + virtual void computeLoopSafetyInfo(const Loop *CurLoop) = 0; /// Returns true if the instruction in a loop is guaranteed to execute at /// least once (under the assumption that the loop is entered). 
- bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, - const Loop *CurLoop) const; + virtual bool isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop) const = 0; LoopSafetyInfo() = default; + + virtual ~LoopSafetyInfo() = default; +}; + + +/// Simple and conservative implementation of LoopSafetyInfo that can give +/// false-positive answers to its queries in order to avoid complicated +/// analysis. +class SimpleLoopSafetyInfo: public LoopSafetyInfo { + bool MayThrow = false; // The current loop contains an instruction which + // may throw. + bool HeaderMayThrow = false; // Same as previous, but specific to loop header + +public: + virtual bool headerMayThrow() const; + + virtual bool blockMayThrow(const BasicBlock *BB) const; + + virtual bool anyBlockMayThrow() const; + + virtual void computeLoopSafetyInfo(const Loop *CurLoop); + + virtual bool isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop) const; + + SimpleLoopSafetyInfo() : LoopSafetyInfo() {}; + + virtual ~SimpleLoopSafetyInfo() {}; }; } diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index bce941be26c..618e2e3e30d 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -33,20 +33,20 @@ void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) { ColorsForNewBlock = ColorsForOldBlock; } -bool LoopSafetyInfo::headerMayThrow() const { +bool SimpleLoopSafetyInfo::headerMayThrow() const { return HeaderMayThrow; } -bool LoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const { +bool SimpleLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const { (void)BB; return anyBlockMayThrow(); } -bool LoopSafetyInfo::anyBlockMayThrow() const { +bool SimpleLoopSafetyInfo::anyBlockMayThrow() const { return MayThrow; } -void LoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) { +void SimpleLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) 
{ assert(CurLoop != nullptr && "CurLoop can't be null"); BasicBlock *Header = CurLoop->getHeader(); // Iterate over header and compute safety info. @@ -200,9 +200,9 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop, /// Returns true if the instruction in a loop is guaranteed to execute at least /// once. -bool LoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, - const DominatorTree *DT, - const Loop *CurLoop) const { +bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop) const { // We have to check to make sure that the instruction dominates all // of the exit blocks. If it doesn't, then there is a path out of the loop // which does not execute this instruction, so we can't hoist it. @@ -259,7 +259,7 @@ static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) { // TODO: merge these two routines. For the moment, we display the best // result obtained by *either* implementation. This is a bit unfair since no // caller actually gets the full power at the moment. - LoopSafetyInfo LSI; + SimpleLoopSafetyInfo LSI; LSI.computeLoopSafetyInfo(L); return LSI.isGuaranteedToExecute(I, DT, L) || isGuaranteedToExecuteForEveryIteration(&I, L); diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 6c899289593..e72342b88b6 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -267,7 +267,7 @@ bool LoopInvariantCodeMotion::runOnLoop( BasicBlock *Preheader = L->getLoopPreheader(); // Compute loop safety information. - LoopSafetyInfo SafetyInfo; + SimpleLoopSafetyInfo SafetyInfo; SafetyInfo.computeLoopSafetyInfo(L); // We want to visit all of the instructions in this loop... 
that are not parts diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 9a45551f64b..4b375956a12 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -320,7 +320,7 @@ bool LoopIdiomRecognize::runOnCountableLoop() { // The following transforms hoist stores/memsets into the loop pre-header. // Give up if the loop has instructions may throw. - LoopSafetyInfo SafetyInfo; + SimpleLoopSafetyInfo SafetyInfo; SafetyInfo.computeLoopSafetyInfo(CurLoop); if (SafetyInfo.anyBlockMayThrow()) return MadeChange; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index cd49f51283f..4a089dfa7db 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -189,7 +189,7 @@ namespace { BasicBlock *loopPreheader = nullptr; bool SanitizeMemory; - LoopSafetyInfo SafetyInfo; + SimpleLoopSafetyInfo SafetyInfo; // LoopBlocks contains all of the basic blocks of the loop, including the // preheader of the loop, the body of the loop, and the exit blocks of the diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp index c17a64f0187..8949c603a84 100644 --- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -761,7 +761,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT, } // Check the loop safety info for exceptions. 
- LoopSafetyInfo LSI; + SimpleLoopSafetyInfo LSI; LSI.computeLoopSafetyInfo(L); if (LSI.anyBlockMayThrow()) { LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n"); -- GitLab From 141415c0fe2b6e60203a7dc08b672b179cff371b Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 09:11:25 +0000 Subject: [PATCH 0240/1116] [NFC] Remove obsolete method headerMayThrow git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344596 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/MustExecute.h | 7 ------- lib/Analysis/MustExecute.cpp | 15 ++------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index 6a6a127b7c3..f136ff750de 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -66,11 +66,6 @@ public: /// Copy colors of block \p Old into the block \p New. void copyColors(BasicBlock *New, BasicBlock *Old); - /// Returns true iff the header block of the loop for which this info is - /// calculated contains an instruction that may throw or otherwise exit - /// abnormally. - virtual bool headerMayThrow() const = 0; - /// Returns true iff the block \p BB potentially may throw exception. It can /// be false-positive in cases when we want to avoid complex analysis. 
virtual bool blockMayThrow(const BasicBlock *BB) const = 0; @@ -112,8 +107,6 @@ class SimpleLoopSafetyInfo: public LoopSafetyInfo { bool HeaderMayThrow = false; // Same as previous, but specific to loop header public: - virtual bool headerMayThrow() const; - virtual bool blockMayThrow(const BasicBlock *BB) const; virtual bool anyBlockMayThrow() const; diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index 618e2e3e30d..4e42f336dc7 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -33,10 +33,6 @@ void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) { ColorsForNewBlock = ColorsForOldBlock; } -bool SimpleLoopSafetyInfo::headerMayThrow() const { - return HeaderMayThrow; -} - bool SimpleLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const { (void)BB; return anyBlockMayThrow(); @@ -203,10 +199,6 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop, bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT, const Loop *CurLoop) const { - // We have to check to make sure that the instruction dominates all - // of the exit blocks. If it doesn't, then there is a path out of the loop - // which does not execute this instruction, so we can't hoist it. - // If the instruction is in the header block for the loop (which is very // common), it is always guaranteed to dominate the exit blocks. Since this // is a common case, and can save some work, check it now. @@ -215,15 +207,12 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, // Inst unless we can prove that Inst comes before the potential implicit // exit. At the moment, we use a (cheap) hack for the common case where // the instruction of interest is the first one in the block. 
- return !headerMayThrow() || + return !HeaderMayThrow || Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; // If there is a path from header to exit or latch that doesn't lead to our // instruction's block, return false. - if (!allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT)) - return false; - - return true; + return allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT); } -- GitLab From 6ccf5849d202cf261cf33335a027f8967e98b0f0 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Tue, 16 Oct 2018 09:37:52 +0000 Subject: [PATCH 0241/1116] [VPlan] Script to extract VPlan digraphs from log The vectoriser's debug log prints VPlan digraphs, but it's a bit cumbersome to extract them and render them into PNG images. This script does exactly that, being careful enough to extract all individual plans, name them appropriately and save in either .dot or .png files. Example usage: $ opt -O3 -debug-only=loop-vectorize file.ll -S -o /dev/null 2> debug.log $ $LLVM_SRC/utils/extract_vplan.py < debug.log Exporting VF1UF1 to DOT: VPlanVF1UF1.dot Exporting VF24UF1 to DOT: VPlanVF24UF1.dot $ $LLVM_SRC/utils/extract_vplan.py --png < debug.log Exporting VF1UF1 to PNG via dot: VPlanVF1UF1.png Exporting VF24UF1 to PNG via dot: VPlanVF24UF1.png $ xdot VPlanVF1UF1.dot Differential Revision: https://reviews.llvm.org/D53142 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344599 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/extract_vplan.py | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100755 utils/extract_vplan.py diff --git a/utils/extract_vplan.py b/utils/extract_vplan.py new file mode 100755 index 00000000000..ac0055d2e79 --- /dev/null +++ b/utils/extract_vplan.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# This script extracts the VPlan digraphs from the vectoriser debug messages +# and saves them in individual dot files (one for each plan). 
Optionally, and +# providing 'dot' is installed, it can also render the dot into a PNG file. + +import sys +import re +import argparse +import shutil +import subprocess + +parser = argparse.ArgumentParser() +parser.add_argument('--png', action='store_true') +args = parser.parse_args() + +dot = shutil.which('dot') +if args.png and not dot: + raise RuntimeError("Can't export to PNG without 'dot' in the system") + +pattern = re.compile(r"(digraph VPlan {.*?\n})",re.DOTALL) +matches = re.findall(pattern, sys.stdin.read()) + +for vplan in matches: + m = re.search("graph \[.+(VF=.+,UF.+), ", vplan) + if not m: + raise ValueError("Can't get the right VPlan name") + name = re.sub('[^a-zA-Z0-9]', '', m.group(1)) + + if args.png: + filename = 'VPlan' + name + '.png' + print("Exporting " + name + " to PNG via dot: " + filename) + p = subprocess.Popen([dot, '-Tpng', '-o', filename], + encoding='utf-8', + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate(input=vplan) + if err: + raise RuntimeError("Error running dot: " + err) + + else: + filename = 'VPlan' + name + '.dot' + print("Exporting " + name + " to DOT: " + filename) + with open(filename, 'w') as out: + out.write(vplan) -- GitLab From 1f1ae517ddc8777691807df63e3e3752164a5fa2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 16 Oct 2018 09:50:16 +0000 Subject: [PATCH 0242/1116] [X86] Fix Skylake ReadAfterLd for PADDrm etc. Missed in rL343868 due to their custom InstrRW.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344600 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86SchedSkylakeClient.td | 6 +- lib/Target/X86/X86SchedSkylakeServer.td | 6 +- test/tools/llvm-mca/X86/read-after-ld-2.s | 104 +++++++++++----------- 3 files changed, 61 insertions(+), 55 deletions(-) diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index b5d842a52b5..d4a3eb07b98 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -1133,7 +1133,8 @@ def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> { def: InstRW<[SKLWriteResGroup91], (instrs VINSERTF128rm, VINSERTI128rm, VPBLENDDrmi)>; -def: InstRW<[SKLWriteResGroup91], (instregex "(V?)PADD(B|D|Q|W)rm", +def: InstRW<[SKLWriteResGroup91, ReadAfterVecXLd], + (instregex "(V?)PADD(B|D|Q|W)rm", "(V?)PSUB(B|D|Q|W)rm")>; def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> { @@ -1230,7 +1231,8 @@ def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> { let ResourceCycles = [1,1]; } def: InstRW<[SKLWriteResGroup110], (instrs VPBLENDDYrmi)>; -def: InstRW<[SKLWriteResGroup110], (instregex "VPADD(B|D|Q|W)Yrm", +def: InstRW<[SKLWriteResGroup110, ReadAfterVecYLd], + (instregex "VPADD(B|D|Q|W)Yrm", "VPSUB(B|D|Q|W)Yrm")>; def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index d3fa912be11..cbcb6a6e58b 100644 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -1339,7 +1339,8 @@ def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> { } def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm, VPBLENDDrmi)>; -def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)", +def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd], + (instregex "VBLENDMPDZ128rm(b?)", "VBLENDMPSZ128rm(b?)", "VBROADCASTI32X2Z128m(b?)", 
"VBROADCASTSSZ128m(b?)", @@ -1534,7 +1535,8 @@ def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> { } def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm, VPBLENDDYrmi)>; -def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPD(Z|Z256)rm(b?)", +def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd], + (instregex "VBLENDMPD(Z|Z256)rm(b?)", "VBLENDMPS(Z|Z256)rm(b?)", "VBROADCASTF32X2Z256m(b?)", "VBROADCASTF32X2Zm(b?)", diff --git a/test/tools/llvm-mca/X86/read-after-ld-2.s b/test/tools/llvm-mca/X86/read-after-ld-2.s index ee39b645d5a..7d549b39595 100644 --- a/test/tools/llvm-mca/X86/read-after-ld-2.s +++ b/test/tools/llvm-mca/X86/read-after-ld-2.s @@ -23,7 +23,7 @@ cmp %edi, %edx # HASWELL-NEXT: Total Cycles: 143 # HASWELL-NEXT: Total uOps: 500 -# SKYLAKE-NEXT: Total Cycles: 803 +# SKYLAKE-NEXT: Total Cycles: 110 # SKYLAKE-NEXT: Total uOps: 500 # ZNVER1-NEXT: Total Cycles: 110 @@ -40,8 +40,8 @@ cmp %edi, %edx # HASWELL-NEXT: Block RThroughput: 1.3 # SKYLAKE: Dispatch Width: 6 -# SKYLAKE-NEXT: uOps Per Cycle: 0.62 -# SKYLAKE-NEXT: IPC: 0.50 +# SKYLAKE-NEXT: uOps Per Cycle: 4.55 +# SKYLAKE-NEXT: IPC: 3.64 # SKYLAKE-NEXT: Block RThroughput: 0.8 # ZNVER1: Dispatch Width: 4 @@ -57,8 +57,8 @@ cmp %edi, %edx # HASWELL-NEXT: 0123456789 # HASWELL-NEXT: Index 0123456789 012 -# SKYLAKE-NEXT: 0123456789 0123456789 0123456789 01234 -# SKYLAKE-NEXT: Index 0123456789 0123456789 0123456789 0123456789 +# SKYLAKE-NEXT: 0123456789 +# SKYLAKE-NEXT: Index 0123456789 # ZNVER1-NEXT: 0123456789 # ZNVER1-NEXT: Index 0123456789 @@ -145,43 +145,46 @@ cmp %edi, %edx # HASWELL-NEXT: [9,2] . . . DeE-------R addq $32, %r8 # HASWELL-NEXT: [9,3] . . . DeE------R cmpl %edi, %edx -# SKYLAKE: [0,0] DeER . . . . . . . . . . . . . . . addl $1, %edx -# SKYLAKE-NEXT: [0,1] DeeeeeeeeER . . . . . . . . . . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [0,2] DeE-------R . . . . . . . . . . . . . addq $32, %r8 -# SKYLAKE-NEXT: [0,3] D=eE------R . . . . . . . . . . . . . 
cmpl %edi, %edx -# SKYLAKE-NEXT: [1,0] D=eE------R . . . . . . . . . . . . . addl $1, %edx -# SKYLAKE-NEXT: [1,1] .D=======eeeeeeeeER . . . . . . . . . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [1,2] .DeE--------------R . . . . . . . . . . . . addq $32, %r8 -# SKYLAKE-NEXT: [1,3] .D=eE-------------R . . . . . . . . . . . . cmpl %edi, %edx -# SKYLAKE-NEXT: [2,0] .D=eE-------------R . . . . . . . . . . . . addl $1, %edx -# SKYLAKE-NEXT: [2,1] . D==============eeeeeeeeER . . . . . . . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [2,2] . DeE---------------------R . . . . . . . . . . addq $32, %r8 -# SKYLAKE-NEXT: [2,3] . D=eE--------------------R . . . . . . . . . . cmpl %edi, %edx -# SKYLAKE-NEXT: [3,0] . D=eE--------------------R . . . . . . . . . . addl $1, %edx -# SKYLAKE-NEXT: [3,1] . D=====================eeeeeeeeER. . . . . . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [3,2] . DeE----------------------------R. . . . . . . . . addq $32, %r8 -# SKYLAKE-NEXT: [3,3] . D=eE---------------------------R. . . . . . . . . cmpl %edi, %edx -# SKYLAKE-NEXT: [4,0] . D=eE---------------------------R. . . . . . . . . addl $1, %edx -# SKYLAKE-NEXT: [4,1] . D============================eeeeeeeeER . . . . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [4,2] . DeE-----------------------------------R . . . . . . . addq $32, %r8 -# SKYLAKE-NEXT: [4,3] . D=eE----------------------------------R . . . . . . . cmpl %edi, %edx -# SKYLAKE-NEXT: [5,0] . D=eE----------------------------------R . . . . . . . addl $1, %edx -# SKYLAKE-NEXT: [5,1] . D===================================eeeeeeeeER . . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [5,2] . DeE------------------------------------------R . . . . . addq $32, %r8 -# SKYLAKE-NEXT: [5,3] . D=eE-----------------------------------------R . . . . . cmpl %edi, %edx -# SKYLAKE-NEXT: [6,0] . D=eE-----------------------------------------R . . . . . addl $1, %edx -# SKYLAKE-NEXT: [6,1] . 
.D==========================================eeeeeeeeER . . . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [6,2] . .DeE-------------------------------------------------R . . . . addq $32, %r8 -# SKYLAKE-NEXT: [6,3] . .D=eE------------------------------------------------R . . . . cmpl %edi, %edx -# SKYLAKE-NEXT: [7,0] . .D=eE------------------------------------------------R . . . . addl $1, %edx -# SKYLAKE-NEXT: [7,1] . . D=================================================eeeeeeeeER . . vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [7,2] . . DeE--------------------------------------------------------R . . addq $32, %r8 -# SKYLAKE-NEXT: [7,3] . . D=eE-------------------------------------------------------R . . cmpl %edi, %edx -# SKYLAKE-NEXT: [8,0] . . D=eE-------------------------------------------------------R . . addl $1, %edx -# SKYLAKE-NEXT: [8,1] . . D========================================================eeeeeeeeER vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: [8,2] . . DeE---------------------------------------------------------------R addq $32, %r8 -# SKYLAKE-NEXT: [8,3] . . D=eE--------------------------------------------------------------R cmpl %edi, %edx -# SKYLAKE-NEXT: [9,0] . . D=eE--------------------------------------------------------------R addl $1, %edx +# SKYLAKE: [0,0] DeER . . . . addl $1, %edx +# SKYLAKE-NEXT: [0,1] DeeeeeeeeER . . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [0,2] DeE-------R . . addq $32, %r8 +# SKYLAKE-NEXT: [0,3] D=eE------R . . cmpl %edi, %edx +# SKYLAKE-NEXT: [1,0] D=eE------R . . addl $1, %edx +# SKYLAKE-NEXT: [1,1] .DeeeeeeeeER . . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [1,2] .DeE-------R . . addq $32, %r8 +# SKYLAKE-NEXT: [1,3] .D=eE------R . . cmpl %edi, %edx +# SKYLAKE-NEXT: [2,0] .D=eE------R . . addl $1, %edx +# SKYLAKE-NEXT: [2,1] . DeeeeeeeeER . . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [2,2] . DeE-------R . . addq $32, %r8 +# SKYLAKE-NEXT: [2,3] . D=eE------R . . cmpl %edi, %edx +# SKYLAKE-NEXT: [3,0] . 
D=eE------R . . addl $1, %edx +# SKYLAKE-NEXT: [3,1] . DeeeeeeeeER . . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [3,2] . DeE-------R . . addq $32, %r8 +# SKYLAKE-NEXT: [3,3] . D=eE------R . . cmpl %edi, %edx +# SKYLAKE-NEXT: [4,0] . D=eE------R . . addl $1, %edx +# SKYLAKE-NEXT: [4,1] . DeeeeeeeeER. . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [4,2] . DeE-------R. . addq $32, %r8 +# SKYLAKE-NEXT: [4,3] . D=eE------R. . cmpl %edi, %edx +# SKYLAKE-NEXT: [5,0] . D=eE------R. . addl $1, %edx +# SKYLAKE-NEXT: [5,1] . DeeeeeeeeER . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [5,2] . DeE-------R . addq $32, %r8 +# SKYLAKE-NEXT: [5,3] . D=eE------R . cmpl %edi, %edx +# SKYLAKE-NEXT: [6,0] . D=eE------R . addl $1, %edx +# SKYLAKE-NEXT: [6,1] . .DeeeeeeeeER . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [6,2] . .DeE-------R . addq $32, %r8 +# SKYLAKE-NEXT: [6,3] . .D=eE------R . cmpl %edi, %edx +# SKYLAKE-NEXT: [7,0] . .D=eE------R . addl $1, %edx +# SKYLAKE-NEXT: [7,1] . . DeeeeeeeeER . vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [7,2] . . DeE-------R . addq $32, %r8 +# SKYLAKE-NEXT: [7,3] . . D=eE------R . cmpl %edi, %edx +# SKYLAKE-NEXT: [8,0] . . D=eE------R . addl $1, %edx +# SKYLAKE-NEXT: [8,1] . . DeeeeeeeeER. vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [8,2] . . DeE-------R. addq $32, %r8 +# SKYLAKE-NEXT: [8,3] . . D=eE------R. cmpl %edi, %edx +# SKYLAKE-NEXT: [9,0] . . D=eE------R. addl $1, %edx +# SKYLAKE-NEXT: [9,1] . . DeeeeeeeeER vpaddd (%r8), %ymm0, %ymm0 +# SKYLAKE-NEXT: [9,2] . . DeE-------R addq $32, %r8 +# SKYLAKE-NEXT: [9,3] . . D=eE------R cmpl %edi, %edx # ZNVER1: [0,0] DeER . . . . addl $1, %edx # ZNVER1-NEXT: [0,1] DeeeeeeeeER . . vpaddd (%r8), %ymm0, %ymm0 @@ -233,21 +236,20 @@ cmp %edi, %edx # ALL: [0] [1] [2] [3] # BDWELL-NEXT: 0. 10 1.0 0.4 4.5 addl $1, %edx -# BDWELL-NEXT: 1. 10 1.0 0.1 0.0 vpaddd (%r8), %ymm0, %ymm0 +# HASWELL-NEXT: 0. 10 1.0 0.4 5.4 addl $1, %edx +# SKYLAKE-NEXT: 0. 10 1.9 0.1 5.4 addl $1, %edx +# ZNVER1-NEXT: 0. 
10 1.0 0.1 5.4 addl $1, %edx + +# ALL-NEXT: 1. 10 1.0 0.1 0.0 vpaddd (%r8), %ymm0, %ymm0 + # BDWELL-NEXT: 2. 10 1.0 0.4 5.7 addq $32, %r8 # BDWELL-NEXT: 3. 10 1.0 0.0 5.3 cmpl %edi, %edx -# HASWELL-NEXT: 0. 10 1.0 0.4 5.4 addl $1, %edx -# HASWELL-NEXT: 1. 10 1.0 0.1 0.0 vpaddd (%r8), %ymm0, %ymm0 # HASWELL-NEXT: 2. 10 1.0 0.4 6.7 addq $32, %r8 # HASWELL-NEXT: 3. 10 1.0 0.0 6.3 cmpl %edi, %edx -# SKYLAKE-NEXT: 0. 10 1.9 0.1 30.6 addl $1, %edx -# SKYLAKE-NEXT: 1. 10 32.5 0.1 0.0 vpaddd (%r8), %ymm0, %ymm0 -# SKYLAKE-NEXT: 2. 10 1.0 0.1 38.5 addq $32, %r8 -# SKYLAKE-NEXT: 3. 10 2.0 0.0 37.5 cmpl %edi, %edx +# SKYLAKE-NEXT: 2. 10 1.0 0.1 7.0 addq $32, %r8 +# SKYLAKE-NEXT: 3. 10 2.0 0.0 6.0 cmpl %edi, %edx -# ZNVER1-NEXT: 0. 10 1.0 0.1 5.4 addl $1, %edx -# ZNVER1-NEXT: 1. 10 1.0 0.1 0.0 vpaddd (%r8), %ymm0, %ymm0 # ZNVER1-NEXT: 2. 10 1.0 0.1 7.0 addq $32, %r8 # ZNVER1-NEXT: 3. 10 2.0 0.0 6.0 cmpl %edi, %edx -- GitLab From b22a1a5cdb2fd318aa7e2273c23e34949efab70e Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Tue, 16 Oct 2018 09:58:09 +0000 Subject: [PATCH 0243/1116] [NFC] Introduce ICFLoopSafetyInfo This is an alternative implementation of LoopSafetyInfo that uses the implicit control flow tracking to give precise answers on queries "whether or not this block contains throwing instructions". This rules out false-positive answers on LoopSafetyInfo's queries. This patch only introduces the new implementation. It is not currently used in any pass. The enabling patches will go separately, through review. The plan is to completely replace all uses of LoopSafetyInfo with ICFLoopSafetyInfo in the future, but to avoid introducing functional problems, we will do it pass by pass. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344601 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/MustExecute.h | 32 +++++++++++++++++++++++++++++ lib/Analysis/MustExecute.cpp | 31 ++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h index f136ff750de..62d9b056e88 100644 --- a/include/llvm/Analysis/MustExecute.h +++ b/include/llvm/Analysis/MustExecute.h @@ -19,6 +19,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/InstructionPrecedenceTracking.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Dominators.h" @@ -122,6 +123,37 @@ public: virtual ~SimpleLoopSafetyInfo() {}; }; +/// This implementation of LoopSafetyInfo uses ImplicitControlFlowTracking to +/// give precise answers on "may throw" queries. This implementation uses a cache +/// that should be invalidated by calling the method dropCachedInfo whenever we +/// modify a basic block's contents by adding or removing instructions. +class ICFLoopSafetyInfo: public LoopSafetyInfo { + bool MayThrow = false; // The current loop contains an instruction which + // may throw. + // Contains information about implicit control flow in this loop's blocks. + mutable ImplicitControlFlowTracking ICF; + +public: + virtual bool blockMayThrow(const BasicBlock *BB) const; + + virtual bool anyBlockMayThrow() const; + + virtual void computeLoopSafetyInfo(const Loop *CurLoop); + + virtual bool isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop) const; + + /// Drops cached information regarding the implicit control flow in block + /// \p BB. It should be called for every block in which we add or remove any + /// instructions before we make queries to it.
+ void dropCachedInfo(const BasicBlock *BB); + + ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT) {}; + + virtual ~ICFLoopSafetyInfo() {}; +}; + } #endif diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp index 4e42f336dc7..64ee2a7e5b0 100644 --- a/lib/Analysis/MustExecute.cpp +++ b/lib/Analysis/MustExecute.cpp @@ -61,6 +61,31 @@ void SimpleLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) { computeBlockColors(CurLoop); } +bool ICFLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const { + return ICF.hasICF(BB); +} + +bool ICFLoopSafetyInfo::anyBlockMayThrow() const { + return MayThrow; +} + +void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) { + assert(CurLoop != nullptr && "CurLoop can't be null"); + ICF.clear(); + MayThrow = false; + // Figure out the fact that at least one block may throw. + for (auto &BB : CurLoop->blocks()) + if (ICF.hasICF(&*BB)) { + MayThrow = true; + break; + } + computeBlockColors(CurLoop); +} + +void ICFLoopSafetyInfo::dropCachedInfo(const BasicBlock *BB) { + ICF.invalidateBlock(BB); +} + void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) { // Compute funclet colors if we might sink/hoist in a function with a funclet // personality routine. @@ -215,6 +240,12 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, return allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT); } +bool ICFLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, + const Loop *CurLoop) const { + return !ICF.isDominatedByICFIFromSameBlock(&Inst) && + allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT); +} namespace { struct MustExecutePrinter : public FunctionPass { -- GitLab From 09fdf061bbfb5187bfd678fd2c215217013798fe Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 16 Oct 2018 10:06:15 +0000 Subject: [PATCH 0244/1116] [LegalizeDAG] ExpandLegalINT_TO_FP - cleanup UINT_TO_FP i64 -> f64 expansion. 
Use the SrcVT/DestVT types and the correct shift-amount type, and use AND instead of ZERO_EXTEND_IN_REG. Part of prep work for D52965 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344602 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 37 +++++++++++------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 175df889ef2..07a37a5092a 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2310,6 +2310,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, EVT DestVT, const SDLoc &dl) { EVT SrcVT = Op0.getValueType(); + EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout()); // TODO: Should any fast-math-flags be set for the created nodes? LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n"); @@ -2371,24 +2372,21 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, // TODO: Generalize this for use with other types.
if (SrcVT == MVT::i64 && DestVT == MVT::f64) { LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n"); - SDValue TwoP52 = - DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64); - SDValue TwoP84PlusTwoP52 = - DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl, - MVT::f64); - SDValue TwoP84 = - DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64); - - SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32); - SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, - DAG.getConstant(32, dl, MVT::i64)); - SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52); - SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84); - SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr); - SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr); - SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt, - TwoP84PlusTwoP52); - return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub); + SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT); + SDValue TwoP84PlusTwoP52 = DAG.getConstantFP( + BitsToDouble(UINT64_C(0x4530000000100000)), dl, DestVT); + SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT); + SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT); + SDValue HiShift = DAG.getConstant(32, dl, ShiftVT); + + SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Op0, LoMask); + SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, HiShift); + SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52); + SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84); + SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, LoOr); + SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, DestVT, HiOr); + SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DestVT, HiFlt, TwoP84PlusTwoP52); + return DAG.getNode(ISD::FADD, dl, DestVT, LoFlt, HiSub); } // TODO: Generalize this for use with other types. 
@@ -2399,8 +2397,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0, if (!isSigned) { SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0); - SDValue ShiftConst = DAG.getConstant( - 1, dl, TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout())); + SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT); SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst); SDValue AndConst = DAG.getConstant(1, dl, MVT::i64); SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst); -- GitLab From dacda52aca98da2ae2f3a4160f7197abb97093a6 Mon Sep 17 00:00:00 2001 From: Ayal Zaks Date: Tue, 16 Oct 2018 14:25:02 +0000 Subject: [PATCH 0245/1116] [LV] Add test checks when vectorizing loops under opt for size; NFC Landing this as a separate part of https://reviews.llvm.org/D50480, recording current behavior more accurately, to clarify subsequent diff ([LV] Vectorizing loops of arbitrary trip count without remainder under opt for size). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@344606 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Transforms/LoopVectorize/X86/optsize.ll | 57 +++++++++ .../LoopVectorize/X86/small-size.ll | 107 +++++++++++++++-- .../X86/vect.omp.force.small-tc.ll | 108 ++++++++++++++++-- 3 files changed, 253 insertions(+), 19 deletions(-) create mode 100644 test/Transforms/LoopVectorize/X86/optsize.ll diff --git a/test/Transforms/LoopVectorize/X86/optsize.ll b/test/Transforms/LoopVectorize/X86/optsize.ll new file mode 100644 index 00000000000..057c72044d9 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/optsize.ll @@ -0,0 +1,57 @@ +; This test verifies that the loop vectorizer will NOT vectorize loops that +; will produce a tail loop with the optimize for size or the minimize size +; attributes. This is a target-dependent version of the test. 
+; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s + +target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" + +@tab = common global [32 x i8] zeroinitializer, align 1 + +define i32 @foo_optsize() #0 { +; CHECK-LABEL: @foo_optsize( +; CHECK-NOT: x i8> + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #0 = { optsize } + +define i32 @foo_minsize() #1 { +; CHECK-LABEL: @foo_minsize( +; CHECK-NOT: x i8> + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. 
= select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #1 = { minsize } + diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll index 89d69e232f5..8af7b2e7df9 100644 --- a/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/test/Transforms/LoopVectorize/X86/small-size.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -20,12 +21,33 @@ target triple = "x86_64-apple-macosx10.8.0" @dj = common global [1024 x i32] zeroinitializer, align 16 ; We can optimize this test without a tail. 
-;CHECK-LABEL: @example1( -;CHECK: load <4 x i32> -;CHECK: add nsw <4 x i32> -;CHECK: store <4 x i32> -;CHECK: ret void define void @example1() optsize { +; CHECK-LABEL: @example1( +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[TMP9:%.*]] +; CHECK: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2 +; CHECK: ret void +; br label %1 ;