; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1-ONLY,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512F,AVX512F-SLOW,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512F,AVX512F-FAST,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-SLOW,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW,AVX512BW-FAST,FALLBACK9

; Every function in this file loads one or two <2 x i64> vectors, builds a
; 2-element shuffle from them, concatenates that shuffle with other 128-bit
; operands via an identity-mask shufflevector, and stores the wide result.
; The function names spell the operand order from the highest subvector down to
; the lowest (e.g. "concat_a_to_shuf_of_a" stores <%shuffle, %a>).

; %shuffle swaps the two elements of %a. The stored <4 x i64> is
; <%shuffle, %a>: elements 0-1 come from %shuffle, elements 2-3 from %a.
define void @concat_a_to_shuf_of_a(ptr %a.ptr, ptr %dst) {
; SSE-LABEL: concat_a_to_shuf_of_a:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
; SSE-NEXT:    movdqa %xmm1, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_a_to_shuf_of_a:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_a_to_shuf_of_a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,1]
; AVX2-NEXT:    vmovaps %ymm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_a_to_shuf_of_a:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,1]
; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_a_to_shuf_of_a:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,1]
; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}
; Mirror of the previous test: the stored <4 x i64> is <%a, %shuffle>, where
; %shuffle swaps the two elements of %a. %b is loaded but never used by the
; concat, so the %b.ptr argument only shifts %dst into the third argument slot.
define void @concat_shuf_of_a_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE-LABEL: concat_shuf_of_a_to_a:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, (%rdx)
; SSE-NEXT:    movdqa %xmm1, 16(%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_a_to_a:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rdx)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_a_to_a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
; AVX2-NEXT:    vmovaps %ymm0, (%rdx)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_a_to_a:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
; AVX512F-NEXT:    vmovaps %ymm0, (%rdx)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_a_to_a:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,0]
; AVX512BW-NEXT:    vmovaps %ymm0, (%rdx)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> %a, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; Same <%shuffle, %a> concat as concat_a_to_shuf_of_a, but %shuffle is also
; stored on its own to %shuf.escape.ptr, giving the shuffle a second use.
define void @concat_a_to_shuf_of_a_extrause_of_shuf(ptr %a.ptr, ptr %dst, ptr %shuf.escape.ptr) {
; SSE-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm1, (%rdx)
; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
; SSE-NEXT:    movdqa %xmm1, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vmovaps %xmm1, (%rdx)
; AVX-NEXT:    vmovaps %xmm0, 16(%rsi)
; AVX-NEXT:    vmovaps %xmm1, (%rsi)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vmovaps %xmm1, (%rdx)
; AVX2-NEXT:    vmovaps %xmm0, 16(%rsi)
; AVX2-NEXT:    vmovaps %xmm1, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512F-NEXT:    vmovaps %xmm0, 16(%rsi)
; AVX512F-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512BW-NEXT:    vmovaps %xmm0, 16(%rsi)
; AVX512BW-NEXT:    vmovaps %xmm1, (%rsi)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  store <2 x i64> %shuffle, ptr %shuf.escape.ptr, align 64
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; Two-input shuffle: %shuffle takes element 0 of %a and element 1 of %b
; (mask <0, 3>). The stored <4 x i64> is <%shuffle, %a>.
define void @concat_a_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE2-LABEL: concat_a_to_shuf_of_ab:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd (%rdi), %xmm0
; SSE2-NEXT:    movapd (%rsi), %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm0, 16(%rdx)
; SSE2-NEXT:    movapd %xmm1, (%rdx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: concat_a_to_shuf_of_ab:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movaps (%rdi), %xmm0
; SSE42-NEXT:    movaps (%rsi), %xmm1
; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; SSE42-NEXT:    movaps %xmm0, 16(%rdx)
; SSE42-NEXT:    movaps %xmm1, (%rdx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: concat_a_to_shuf_of_ab:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm1, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_a_to_shuf_of_ab:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX2-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm1, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_a_to_shuf_of_ab:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX512F-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_a_to_shuf_of_ab:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX512BW-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512BW-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}
; Same two-input %shuffle (element 0 of %a, element 1 of %b), but the stored
; <4 x i64> is <%shuffle, %b>.
define void @concat_b_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE2-LABEL: concat_b_to_shuf_of_ab:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rsi), %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT:    movaps %xmm0, 16(%rdx)
; SSE2-NEXT:    movaps %xmm1, (%rdx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: concat_b_to_shuf_of_ab:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movaps (%rsi), %xmm0
; SSE42-NEXT:    movaps (%rdi), %xmm1
; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE42-NEXT:    movaps %xmm0, 16(%rdx)
; SSE42-NEXT:    movaps %xmm1, (%rdx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: concat_b_to_shuf_of_ab:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rsi), %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm1, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_b_to_shuf_of_ab:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rsi), %xmm0
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm1, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_b_to_shuf_of_ab:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rsi), %xmm0
; AVX512F-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX512F-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_b_to_shuf_of_ab:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rsi), %xmm0
; AVX512BW-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX512BW-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512BW-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; Two-input %shuffle (element 0 of %a, element 1 of %b) placed in the upper
; half: the stored <4 x i64> is <%a, %shuffle>.
define void @concat_shuf_of_ab_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE2-LABEL: concat_shuf_of_ab_to_a:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd (%rdi), %xmm0
; SSE2-NEXT:    movapd (%rsi), %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT:    movapd %xmm0, (%rdx)
; SSE2-NEXT:    movapd %xmm1, 16(%rdx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: concat_shuf_of_ab_to_a:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movaps (%rdi), %xmm0
; SSE42-NEXT:    movaps (%rsi), %xmm1
; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; SSE42-NEXT:    movaps %xmm1, 16(%rdx)
; SSE42-NEXT:    movaps %xmm0, (%rdx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_ab_to_a:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm0, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_ab_to_a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX2-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm0, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_ab_to_a:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX512F-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_ab_to_a:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX512BW-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512BW-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
  %concat = shufflevector <2 x i64> %a, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}
; Two-input %shuffle (element 0 of %a, element 1 of %b) placed in the upper
; half: the stored <4 x i64> is <%b, %shuffle>.
define void @concat_shuf_of_ab_to_b(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE2-LABEL: concat_shuf_of_ab_to_b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rsi), %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
; SSE2-NEXT:    movaps %xmm1, 16(%rdx)
; SSE2-NEXT:    movaps %xmm0, (%rdx)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: concat_shuf_of_ab_to_b:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movaps (%rsi), %xmm0
; SSE42-NEXT:    movaps (%rdi), %xmm1
; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE42-NEXT:    movaps %xmm1, 16(%rdx)
; SSE42-NEXT:    movaps %xmm0, (%rdx)
; SSE42-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_ab_to_b:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rsi), %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm0, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_ab_to_b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rsi), %xmm0
; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX2-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm0, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_ab_to_b:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rsi), %xmm0
; AVX512F-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX512F-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_ab_to_b:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rsi), %xmm0
; AVX512BW-NEXT:    vblendps {{.*#+}} xmm1 = mem[0,1],xmm0[2,3]
; AVX512BW-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512BW-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
  %concat = shufflevector <2 x i64> %b, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; %shuffle swaps the two elements of %a and is concatenated with an unrelated
; vector: the stored <4 x i64> is <%shuffle, %b>.
define void @concat_b_to_shuf_of_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE-LABEL: concat_b_to_shuf_of_a:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = mem[2,3,0,1]
; SSE-NEXT:    movaps %xmm0, 16(%rdx)
; SSE-NEXT:    movdqa %xmm1, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_b_to_shuf_of_a:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rsi), %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm1, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_b_to_shuf_of_a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rsi), %xmm0
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX2-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm1, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_b_to_shuf_of_a:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rsi), %xmm0
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX512F-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_b_to_shuf_of_a:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rsi), %xmm0
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX512BW-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512BW-NEXT:    vmovaps %xmm1, (%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}
; Mirror of the previous test: the stored <4 x i64> is <%b, %shuffle>, where
; %shuffle swaps the two elements of %a.
define void @concat_shuf_of_a_to_b(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE-LABEL: concat_shuf_of_a_to_b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = mem[2,3,0,1]
; SSE-NEXT:    movdqa %xmm1, 16(%rdx)
; SSE-NEXT:    movaps %xmm0, (%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_a_to_b:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rsi), %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX-NEXT:    vmovaps %xmm0, (%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_a_to_b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rsi), %xmm0
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX2-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX2-NEXT:    vmovaps %xmm0, (%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_a_to_b:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rsi), %xmm0
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX512F-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512F-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_a_to_b:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rsi), %xmm0
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX512BW-NEXT:    vmovaps %xmm1, 16(%rdx)
; AVX512BW-NEXT:    vmovaps %xmm0, (%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> %b, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; %shuffle swaps the two elements of %a; the upper half of the stored
; <4 x i64> is poison, i.e. the value is <%shuffle, poison>. The assertions
; expect only the low 16 bytes of %dst to be written.
define void @concat_poison_to_shuf_of_a(ptr %a.ptr, ptr %dst) {
; SSE-LABEL: concat_poison_to_shuf_of_a:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_poison_to_shuf_of_a:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_poison_to_shuf_of_a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    vmovaps %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_poison_to_shuf_of_a:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX512F-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_poison_to_shuf_of_a:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}
; %shuffle swaps the two elements of %a; the lower half of the stored
; <4 x i64> is poison, i.e. the value is <poison, %shuffle>. The assertions
; expect only bytes 16-31 of %dst to be written. %b is loaded but never used
; by the concat.
define void @concat_shuf_of_a_to_poison(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
; SSE-LABEL: concat_shuf_of_a_to_poison:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, 16(%rdx)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_a_to_poison:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_a_to_poison:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX2-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_a_to_poison:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX512F-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_a_to_poison:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,0,1]
; AVX512BW-NEXT:    vmovaps %xmm0, 16(%rdx)
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %b = load <2 x i64>, ptr %b.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> poison, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; %shuffle swaps the two elements of %a and is used for both halves: the
; stored <4 x i64> is <%shuffle, %shuffle>.
define void @concat_shuf_of_a_to_itself(ptr %a.ptr, ptr %dst) {
; SSE-LABEL: concat_shuf_of_a_to_itself:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
; SSE-NEXT:    movdqa %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_a_to_itself:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vmovaps %ymm0, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_a_to_itself:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps (%rdi), %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,1,0]
; AVX2-NEXT:    vmovaps %ymm0, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_a_to_itself:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,1,0]
; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_a_to_itself:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,1,0]
; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat = shufflevector <2 x i64> %shuffle, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %concat, ptr %dst, align 64
  ret void
}

; Four-way concat built in two steps: %concat01 = <%shuffle, %a> and
; %concat23 = <%a, %a> are joined into an <8 x i64>, so the stored value is
; <%shuffle, %a, %a, %a> with the swapped copy of %a in the lowest 128 bits.
define void @concat_aaa_to_shuf_of_a(ptr %a.ptr, ptr %dst) {
; SSE-LABEL: concat_aaa_to_shuf_of_a:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, 32(%rsi)
; SSE-NEXT:    movdqa %xmm0, 48(%rsi)
; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
; SSE-NEXT:    movdqa %xmm1, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_aaa_to_shuf_of_a:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX-NEXT:    vmovaps %ymm0, 32(%rsi)
; AVX-NEXT:    vmovaps %ymm1, (%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_aaa_to_shuf_of_a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,1]
; AVX2-NEXT:    vmovaps %ymm0, 32(%rsi)
; AVX2-NEXT:    vmovaps %ymm1, (%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_aaa_to_shuf_of_a:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_aaa_to_shuf_of_a:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat01 = shufflevector <2 x i64> %shuffle, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat23 = shufflevector <2 x i64> %a, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat = shufflevector <4 x i64> %concat01, <4 x i64> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i64> %concat, ptr %dst, align 64
  ret void
}
; Mirror of the previous test: %concat01 = <%a, %a> and
; %concat23 = <%a, %shuffle>, so the stored <8 x i64> is
; <%a, %a, %a, %shuffle> with the swapped copy of %a in the highest 128 bits.
define void @concat_shuf_of_a_to_aaa(ptr %a.ptr, ptr %dst) {
; SSE-LABEL: concat_shuf_of_a_to_aaa:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    movdqa %xmm0, 32(%rsi)
; SSE-NEXT:    movdqa %xmm0, 16(%rsi)
; SSE-NEXT:    movdqa %xmm0, (%rsi)
; SSE-NEXT:    movdqa %xmm1, 48(%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: concat_shuf_of_a_to_aaa:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT:    vmovaps %ymm0, (%rsi)
; AVX-NEXT:    vmovaps %ymm1, 32(%rsi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX2-LABEL: concat_shuf_of_a_to_aaa:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[0,1,1,0]
; AVX2-NEXT:    vmovaps %ymm0, (%rsi)
; AVX2-NEXT:    vmovaps %ymm1, 32(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: concat_shuf_of_a_to_aaa:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: concat_shuf_of_a_to_aaa:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %a = load <2 x i64>, ptr %a.ptr, align 64
  %shuffle = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
  %concat01 = shufflevector <2 x i64> %a, <2 x i64> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat23 = shufflevector <2 x i64> %a, <2 x i64> %shuffle, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat = shufflevector <4 x i64> %concat01, <4 x i64> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i64> %concat, ptr %dst, align 64
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK4: {{.*}}
; FALLBACK5: {{.*}}
; FALLBACK6: {{.*}}
; FALLBACK7: {{.*}}
; FALLBACK8: {{.*}}
; FALLBACK9: {{.*}}