; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64

define <4 x double> @test_broadcast_2f64_4f64(ptr%p) nounwind {
; X86-LABEL: test_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <2 x double>, ptr%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(ptr%p) nounwind {
; X86-LABEL: test_broadcast_2i64_4i64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <2 x i64>, ptr%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(ptr%p) nounwind {
; X86-LABEL: test_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <4 x float>, ptr%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(ptr%p) nounwind {
; X86-LABEL: test_broadcast_4i32_8i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <4 x i32>, ptr%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(ptr%p) nounwind {
; X86-LABEL: test_broadcast_8i16_16i16:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <8 x i16>, ptr%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(ptr%p) nounwind {
; X86-LABEL: test_broadcast_16i8_32i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <16 x i8>, ptr%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

; PR38949 - https://bugs.llvm.org/show_bug.cgi?id=38949
; Don't limit the transform based on extra uses of the load itself (the store is a user of the load's chain value).

define void @subv_reuse_is_ok(ptr %a, ptr %b) {
; X86-LABEL: subv_reuse_is_ok:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: subv_reuse_is_ok:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovups %ymm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %ld = load <4 x float>, ptr %a, align 1
  %splat128 = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  store <8 x float> %splat128, ptr %b, align 16
  ret void
}

define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
; X86-LABEL: test_broadcast_2f64_4f64_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load <2 x double>, ptr %p0
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  store <2 x double> %1, ptr %p1
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
; X86-LABEL: test_broadcast_2i64_4i64_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load <2 x i64>, ptr %p0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  store <2 x i64> %1, ptr %p1
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
; X86-LABEL: test_broadcast_4f32_8f32_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load <4 x float>, ptr %p0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %1, ptr %p1
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
; X86-LABEL: test_broadcast_4i32_8i32_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load <4 x i32>, ptr %p0
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  store <4 x i32> %1, ptr %p1
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
; X86-LABEL: test_broadcast_8i16_16i16_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load <8 x i16>, ptr%p0
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %1, ptr %p1
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
; X86-LABEL: test_broadcast_16i8_32i8_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    retq
  %1 = load <16 x i8>, ptr%p0
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <16 x i8> %1, ptr %p1
  ret <32 x i8> %2
}

define <8 x i32> @PR29088(ptr %p0, ptr %p1) {
; X86-LABEL: PR29088:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    vmovaps %ymm1, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: PR29088:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    vmovaps %ymm1, (%rsi)
; X64-NEXT:    retq
  %ld = load <4 x i32>, ptr %p0
  store <8 x float> zeroinitializer, ptr %p1
  %shuf = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %shuf
}