; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
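
; The three AVX512 RUN lines per triple map to a single X86-AVX512/X64-AVX512
; prefix: these subvector broadcasts only move whole 128/256-bit lanes, so
; adding +avx512bw or +avx512dq does not change the generated code.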

;
; Subvector Load + Broadcast
;

define <4 x double> @test_broadcast_2f64_4f64(ptr%p) nounwind {
; X86-LABEL: test_broadcast_2f64_4f64:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
  %1 = load <2 x double>, ptr%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <8 x double> @test_broadcast_2f64_8f64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2f64_8f64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <2 x double>, ptr%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f64_8f64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovaps (%eax), %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x double>, ptr%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %2
}
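
; A 128-bit load broadcast to 256 bits folds into one vbroadcastf128; the
; mem[0,1,0,1] annotation shows both halves reading the same two elements.
; For 512-bit results, plain AVX targets return the value split across
; ymm0/ymm1 (hence the extra vmovaps), while AVX512 uses a single zmm
; broadcast. Broadcasting a 256-bit subvector to 512 bits on AVX needs no
; shuffle at all: just the load plus a register copy.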

define <4 x i64> @test_broadcast_2i64_4i64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_4i64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <2 x i64>, ptr%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_8i64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <2 x i64>, ptr%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i64_8i64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovaps (%eax), %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i64>, ptr%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %2
}
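
; Note that the AVX prefixes (covering both AVX1 and AVX2) show the
; float-domain vbroadcastf128 even for the integer vectors here, presumably
; because no integer-domain instruction consumes the result; the AVX512
; prefixes select the integer forms vbroadcasti128, vbroadcasti32x4 and
; vbroadcasti64x4.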

define <8 x float> @test_broadcast_4f32_8f32(ptr%p) nounwind {
; X86-LABEL: test_broadcast_4f32_8f32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
  %1 = load <4 x float>, ptr%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f32_16f32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x float>, ptr%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8f32_16f32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovaps (%eax), %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <8 x float>, ptr%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_8i32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, ptr%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_16i32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, ptr%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i32_16i32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovaps (%eax), %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <8 x i32>, ptr%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <8 x i16>, ptr%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_32i16:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <8 x i16>, ptr%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %2
}

define <32 x i16> @test_broadcast_16i16_32i16(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i16_32i16:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovaps (%eax), %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <16 x i16>, ptr%p
  %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %2
}
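
; There are no vbroadcasti16x8/vbroadcasti8x16 forms, so for the i16 (and i8,
; below) cases AVX512 expresses the 128-bit lane broadcast with the
; bit-identical vbroadcasti32x4.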

define <32 x i8> @test_broadcast_16i8_32i8(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = load <16 x i8>, ptr%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_64i8:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <16 x i8>, ptr%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(ptr%p) nounwind {
; X86-AVX-LABEL: test_broadcast_32i8_64i8:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: vmovaps (%eax), %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <32 x i8>, ptr%p
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;
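
; In these tests the loaded 128-bit subvector is also stored. The load still
; folds into the broadcast, and the store is satisfied from the low 128 bits
; of the result (vmovaps/vmovdqa %xmm0), so memory is only read once.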

define <4 x double> @test_broadcast_2f64_4f64_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <2 x double>, ptr %p0
  store <2 x double> %1, ptr %p1
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <2 x i64>, ptr %p0
  store <2 x i64> %1, ptr %p1
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <4 x float>, ptr %p0
  store <4 x float> %1, ptr %p1
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, ptr %p0
  store <4 x i32> %1, ptr %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(ptr%p0, ptr%p1) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <8 x i16>, ptr%p0
  store <8 x i16> %1, ptr %p1
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(ptr%p0, ptr%p1) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm0, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <16 x i8>, ptr%p0
  store <16 x i8> %1, ptr %p1
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;
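
; Here the intervening store is independent of the loaded value (it stores
; zeroinitializer), so it must not be combined with the broadcast: the zero is
; materialized in %xmm1 and stored separately, while the load still folds into
; the broadcast.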

define <8 x i32> @test_broadcast_4i32_8i32_chain(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm1, (%eax)
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT: vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, ptr %p0
  store <4 x float> zeroinitializer, ptr %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(ptr %p0, ptr %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT: vmovaps %xmm1, (%eax)
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, ptr %p0
  store <4 x float> zeroinitializer, ptr %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

;
; Subvector Load with Multiple Uses + Broadcast
; We should fall back to the broadcast.
;
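
; The <1,2,3,4> constant below is needed at both 256-bit and 512-bit widths.
; Instead of materializing it twice, it is loaded once at the widest width as
; a subvector broadcast from the constant pool, and the narrower uses read the
; low subregister. (On i686 the i64 elements print as pairs of i32, which is
; why the X86-AVX512 constant appears as [1,0,2,0,3,0,4,0,...].)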

@ga4 = dso_local global <4 x i64> zeroinitializer, align 8
@gb4 = dso_local global <8 x i64> zeroinitializer, align 8

define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4]
; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X86-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6
; X86-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; X86-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vmovdqu %xmm0, ga4+16
; X86-AVX1-NEXT: vmovdqu %xmm4, ga4
; X86-AVX1-NEXT: vmovups %ymm2, gb4+32
; X86-AVX1-NEXT: vmovups %ymm1, gb4
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT: vmovdqu %ymm0, ga4
; X86-AVX2-NEXT: vmovdqu %ymm2, gb4+32
; X86-AVX2-NEXT: vmovdqu %ymm1, gb4
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX512: # %bb.0: # %entry
; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vmovdqu %ymm0, ga4
; X86-AVX512-NEXT: vmovdqu64 %zmm1, gb4
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,4]
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vmovdqu %xmm0, ga4+16(%rip)
; X64-AVX1-NEXT: vmovdqu %xmm4, ga4(%rip)
; X64-AVX1-NEXT: vmovups %ymm2, gb4+32(%rip)
; X64-AVX1-NEXT: vmovups %ymm1, gb4(%rip)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vmovdqu %ymm0, ga4(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+32(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm1, gb4(%rip)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4]
; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovdqu %ymm0, ga4(%rip)
; X64-AVX512-NEXT: vmovdqu64 %zmm1, gb4(%rip)
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, ptr @ga4, align 8
  store <8 x i64> %2, ptr @gb4, align 8
  ret void
}

@ga2 = dso_local global <4 x double> zeroinitializer, align 8
@gb2 = dso_local global <8 x double> zeroinitializer, align 8

define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX: # %bb.0: # %entry
; X86-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; X86-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT: vmovupd %ymm0, ga2
; X86-AVX-NEXT: vmovupd %ymm2, gb2+32
; X86-AVX-NEXT: vmovupd %ymm1, gb2
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX512: # %bb.0: # %entry
; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT: vmovupd %ymm0, ga2
; X86-AVX512-NEXT: vmovupd %zmm1, gb2
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT: vmovupd %ymm0, ga2(%rip)
; X64-AVX-NEXT: vmovupd %ymm2, gb2+32(%rip)
; X64-AVX-NEXT: vmovupd %ymm1, gb2(%rip)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovupd %ymm0, ga2(%rip)
; X64-AVX512-NEXT: vmovupd %zmm1, gb2(%rip)
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, ptr @ga2, align 8
  store <8 x double> %2, ptr @gb2, align 8
  ret void
}
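
; The next test stresses three widths at once: the same <1,2,3,4> i32 constant
; feeds 128-bit, 256-bit and 512-bit operations. AVX512 materializes a single
; zmm register and uses its xmm/ymm subregisters; AVX1/AVX2 build a 256-bit
; lane broadcast and split the 512-bit arithmetic into two halves.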

@ha4 = dso_local global <4 x i32> zeroinitializer, align 8
@hb4 = dso_local global <8 x i32> zeroinitializer, align 8
@hc4 = dso_local global <16 x i32> zeroinitializer, align 8

define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c) nounwind {
; X86-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX1: # %bb.0: # %entry
; X86-AVX1-NEXT: pushl %ebp
; X86-AVX1-NEXT: movl %esp, %ebp
; X86-AVX1-NEXT: andl $-32, %esp
; X86-AVX1-NEXT: subl $32, %esp
; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1]
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4
; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5
; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3
; X86-AVX1-NEXT: vmovdqu %xmm0, ha4
; X86-AVX1-NEXT: vmovups %ymm1, hb4
; X86-AVX1-NEXT: vmovups %ymm3, hc4+32
; X86-AVX1-NEXT: vmovups %ymm2, hc4
; X86-AVX1-NEXT: movl %ebp, %esp
; X86-AVX1-NEXT: popl %ebp
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX2: # %bb.0: # %entry
; X86-AVX2-NEXT: pushl %ebp
; X86-AVX2-NEXT: movl %esp, %ebp
; X86-AVX2-NEXT: andl $-32, %esp
; X86-AVX2-NEXT: subl $32, %esp
; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4
; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT: vmovdqu %xmm0, ha4
; X86-AVX2-NEXT: vmovdqu %ymm1, hb4
; X86-AVX2-NEXT: vmovdqu %ymm3, hc4+32
; X86-AVX2-NEXT: vmovdqu %ymm2, hc4
; X86-AVX2-NEXT: movl %ebp, %esp
; X86-AVX2-NEXT: popl %ebp
; X86-AVX2-NEXT: vzeroupper
; X86-AVX2-NEXT: retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X86-AVX512: # %bb.0: # %entry
; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
; X86-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; X86-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; X86-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
; X86-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; X86-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
; X86-AVX512-NEXT: vmovdqu %xmm0, ha4
; X86-AVX512-NEXT: vmovdqu %ymm1, hb4
; X86-AVX512-NEXT: vmovdqu64 %zmm2, hc4
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1]
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5
; X64-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; X64-AVX1-NEXT: vmovdqu %xmm0, ha4(%rip)
; X64-AVX1-NEXT: vmovups %ymm1, hb4(%rip)
; X64-AVX1-NEXT: vmovups %ymm3, hc4+32(%rip)
; X64-AVX1-NEXT: vmovups %ymm2, hc4(%rip)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3
; X64-AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; X64-AVX2-NEXT: vmovdqu %xmm0, ha4(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm1, hb4(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm3, hc4+32(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm2, hc4(%rip)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
; X64-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0
; X64-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; X64-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1
; X64-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2
; X64-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2
; X64-AVX512-NEXT: vmovdqu %xmm0, ha4(%rip)
; X64-AVX512-NEXT: vmovdqu %ymm1, hb4(%rip)
; X64-AVX512-NEXT: vmovdqu64 %zmm2, hc4(%rip)
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
entry:
  %0 = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
  %1 = add <8 x i32> %b, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  %2 = and <8 x i32> %1, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  %3 = add <16 x i32> %c, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  %4 = and <16 x i32> %3, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  store <4 x i32> %0, ptr @ha4, align 8
  store <8 x i32> %2, ptr @hb4, align 8
  store <16 x i32> %4, ptr @hc4, align 8
  ret void
}
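
; In the i686 AVX1/AVX2 versions above, only the low half of the 512-bit %c
; argument arrives in %ymm2; the upper 256 bits are passed on the stack and
; added directly from 8(%ebp)/24(%ebp), which appears to be why those versions
; realign the stack with andl $-32, %esp.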

;
; Subvector Broadcast from register
;
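
; No memory operand in these tests: the subvector is already live in a
; register, so a 128->256 broadcast becomes vinsertf128 $1 of xmm0 into
; itself, the 128->512 AVX512 cases use vshuff64x2/vshufi64x2, and the
; 256->512 cases use vinsertf64x4. The "# kill" lines are register-allocator
; annotations (xmm0/ymm0 redefined as part of the wider register), not
; instructions.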

define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
; X86-LABEL: reg_broadcast_2f64_4f64:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: reg_broadcast_2f64_4f64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %1
}

define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %1
}

define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
  %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %1
}

define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
; X86-LABEL: reg_broadcast_2i64_4i64:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: reg_broadcast_2i64_4i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %1
}

define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %1
}

define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %1
}

define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; X86-LABEL: reg_broadcast_4f32_8f32:
; X86: # %bb.0:
; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: reg_broadcast_4f32_8f32:
; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %1
}

define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT: retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %1
}

define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: vmovaps %ymm0, %ymm1
; X86-AVX-NEXT: retl
;
; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX512: # %bb.0:
; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %1
}

define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %1
}

define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %1
}

define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X86-LABEL: reg_broadcast_16i8_32i8:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %1
}

define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X86-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; X64-AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %1
}

define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %1
}

;
; PR34394
;
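; NOTE (manually added commentary): these tests check that broadcasting a
; loaded <2 x i32> subvector treats the element pair as a single 64-bit
; lane, so the expected lowering is vmovddup for a 128-bit result and
; vbroadcastsd for wider results.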

define <4 x i32> @test_2xi32_to_4xi32_mem(ptr %vp) {
; X86-LABEL: test_2xi32_to_4xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_4xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %vec = load <2 x i32>, ptr %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %res
}

define <8 x i32> @test_2xi32_to_8xi32_mem(ptr %vp) {
; X86-LABEL: test_2xi32_to_8xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_8xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %vec = load <2 x i32>, ptr %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i32> %res
}
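
; NOTE (manually added commentary): the 512-bit variant follows the same
; pattern; with AVX512VL the vbroadcastsd targets a zmm register directly,
; while plain AVX broadcasts into ymm0 and copies it to ymm1 for the upper
; 256 bits of the return value.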

define <16 x i32> @test_2xi32_to_16xi32_mem(ptr %vp) {
; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd (%eax), %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %zmm0
; X64-AVX512-NEXT:    retq
  %vec = load <2 x i32>, ptr %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}

;
; PR34041
;
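; NOTE (manually added commentary): these tests use shuffle masks with
; undef lanes (and, in the select-based cases, blend the shuffle with a
; %default operand); broadcast instructions should still be selected
; despite the undef elements.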

define <4 x double> @broadcast_v4f64_f64_u000(ptr %p) {
; X86-LABEL: broadcast_v4f64_f64_u000:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_f64_u000:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %s = load double, ptr %p
  %vec = insertelement <2 x double> undef, double %s, i32 0
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

define <4 x double> @broadcast_v4f64_v2f64_4u61(ptr %vp, <4 x double> %default) {
; X86-LABEL: broadcast_v4f64_v2f64_4u61:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_v2f64_4u61:
; X64:       # %bb.0:
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X64-NEXT:    retq
  %vec = load <2 x double>, ptr %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
  ret <4 x double> %res
}

define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(ptr %vp, <8 x float> %default) {
; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; X64-NEXT:    retq
  %vec = load <2 x float>, ptr %vp
  %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
  ret <8 x float> %res
}

define <8 x double> @broadcast_v8f64_v2f64_u1u10101(ptr %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, ptr %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}

define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(ptr %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, ptr %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}
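
; NOTE (manually added commentary): reproducer for PR51226. A <4 x float>
; built from zero-extended and shifted <4 x i16> data is broadcast to 256
; bits, and the fcmp ogt/select pair against zero folds to vminps with a
; zeroed register before the unaligned store.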

define void @PR51226() {
; X86-AVX1-LABEL: PR51226:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: PR51226:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
; X86-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: PR51226:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
; X86-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-AVX512-NEXT:    vmovups %ymm0, (%eax)
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: PR51226:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpslld $16, %xmm0, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, (%rax)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: PR51226:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX2-NEXT:    vpslld $16, %xmm0, %xmm0
; X64-AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: PR51226:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
; X64-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
  %i = load <4 x i16>, ptr undef, align 8
  %i1 = zext <4 x i16> %i to <4 x i32>
  %i2 = shl nuw <4 x i32> %i1, <i32 16, i32 16, i32 16, i32 16>
  %i3 = bitcast <4 x i32> %i2 to <4 x float>
  %shuffle99 = shufflevector <4 x float> %i3, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %i4 = fcmp reassoc nsz contract ogt <8 x float> zeroinitializer, %shuffle99
  %i5 = select <8 x i1> %i4, <8 x float> %shuffle99, <8 x float> zeroinitializer
  store <8 x float> %i5, ptr undef, align 16
  ret void
}