; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-ADL
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE,ENABLE-SPR
; RUN: llc -verify-machineinstrs -mcpu=alderlake -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-ADL
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-perm -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE,DISABLE-SPR

; Tests for the +false-deps-perm feature: with the feature enabled (ENABLE
; prefixes), codegen inserts a zero idiom (vxorps dst, dst, dst) on the
; destination register before each vperm* instruction; with it disabled
; (DISABLE prefixes), no such xor appears.  Each test uses an inline-asm
; "nop" that clobbers xmm1-xmm15 (or xmm2-xmm15) so the destination register
; carries a stale value across the asm block.  The ADL prefixes exercise an
; AVX2-only target (values spilled/reloaded around the asm); the SPR prefixes
; exercise an AVX-512 target (values parked in ymm16/ymm17 instead).

; vpermd with both operands in registers.
define <8 x i32> @permd(<8 x i32> %a0, <8 x i32> %a1) {
; ENABLE-ADL-LABEL: permd:
; ENABLE-ADL: # %bb.0:
; ENABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: #APP
; ENABLE-ADL-NEXT: nop
; ENABLE-ADL-NEXT: #NO_APP
; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0
; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ENABLE-ADL-NEXT: retq
;
; ENABLE-SPR-LABEL: permd:
; ENABLE-SPR: # %bb.0:
; ENABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16
; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17
; ENABLE-SPR-NEXT: #APP
; ENABLE-SPR-NEXT: nop
; ENABLE-SPR-NEXT: #NO_APP
; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0
; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1
; ENABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ENABLE-SPR-NEXT: retq
;
; DISABLE-ADL-LABEL: permd:
; DISABLE-ADL: # %bb.0:
; DISABLE-ADL-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: #APP
; DISABLE-ADL-NEXT: nop
; DISABLE-ADL-NEXT: #NO_APP
; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-ADL-NEXT: vpermd %ymm2, %ymm1, %ymm0
; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DISABLE-ADL-NEXT: retq
;
; DISABLE-SPR-LABEL: permd:
; DISABLE-SPR: # %bb.0:
; DISABLE-SPR-NEXT: vmovdqa64 %ymm1, %ymm16
; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm17
; DISABLE-SPR-NEXT: #APP
; DISABLE-SPR-NEXT: nop
; DISABLE-SPR-NEXT: #NO_APP
; DISABLE-SPR-NEXT: vpermd %ymm17, %ymm16, %ymm0
; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm17, %ymm1
; DISABLE-SPR-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DISABLE-SPR-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
  %3 = add <8 x i32> %a0, %a1
  %res = add <8 x i32> %2, %3
  ret <8 x i32> %res
}

; vpermd with a folded memory operand as the data source.
define <8 x i32> @permd_mem(ptr %p0, <8 x i32> %a1) {
; ENABLE-ADL-LABEL: permd_mem:
; ENABLE-ADL: # %bb.0:
; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: #APP
; ENABLE-ADL-NEXT: nop
; ENABLE-ADL-NEXT: #NO_APP
; ENABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; ENABLE-ADL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0
; ENABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ENABLE-ADL-NEXT: retq
;
; ENABLE-SPR-LABEL: permd_mem:
; ENABLE-SPR: # %bb.0:
; ENABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16
; ENABLE-SPR-NEXT: #APP
; ENABLE-SPR-NEXT: nop
; ENABLE-SPR-NEXT: #NO_APP
; ENABLE-SPR-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0
; ENABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0
; ENABLE-SPR-NEXT: retq
;
; DISABLE-ADL-LABEL: permd_mem:
; DISABLE-ADL: # %bb.0:
; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: #APP
; DISABLE-ADL-NEXT: nop
; DISABLE-ADL-NEXT: #NO_APP
; DISABLE-ADL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; DISABLE-ADL-NEXT: vpermd (%rdi), %ymm1, %ymm0
; DISABLE-ADL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; DISABLE-ADL-NEXT: retq
;
; DISABLE-SPR-LABEL: permd_mem:
; DISABLE-SPR: # %bb.0:
; DISABLE-SPR-NEXT: vmovdqa64 %ymm0, %ymm16
; DISABLE-SPR-NEXT: #APP
; DISABLE-SPR-NEXT: nop
; DISABLE-SPR-NEXT: #NO_APP
; DISABLE-SPR-NEXT: vpermd (%rdi), %ymm16, %ymm0
; DISABLE-SPR-NEXT: vpaddd %ymm16, %ymm0, %ymm0
; DISABLE-SPR-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <8 x i32>, ptr %p0, align 64
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1)
  %res = add <8 x i32> %2, %a1
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

; vpermq (immediate shuffle) with a register source; the result register ymm1
; is stale after the asm block, so ENABLE expects it zeroed first.
define <4 x i64> @permq(<4 x i64> %a0) {
; ENABLE-LABEL: permq:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permq:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,2,1,0]
; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

; vpermq with a memory source (selected as vpermpd on this shuffle).
define <4 x i64> @permq_mem(ptr %p0) {
; ENABLE-LABEL: permq_mem:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permq_mem:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <4 x i64>, ptr %p0, align 64
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x i64> %2
}

; vpermps where the index vector comes from the inline asm's output register.
define <8 x float> @permps(<8 x float> %a0, <8 x i32> %a1) {
; ENABLE-ADL-LABEL: permps:
; ENABLE-ADL: # %bb.0:
; ENABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-ADL-NEXT: #APP
; ENABLE-ADL-NEXT: nop
; ENABLE-ADL-NEXT: #NO_APP
; ENABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-ADL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1
; ENABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0
; ENABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0
; ENABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0
; ENABLE-ADL-NEXT: retq
;
; ENABLE-SPR-LABEL: permps:
; ENABLE-SPR: # %bb.0:
; ENABLE-SPR-NEXT: vmovaps %ymm0, %ymm16
; ENABLE-SPR-NEXT: #APP
; ENABLE-SPR-NEXT: nop
; ENABLE-SPR-NEXT: #NO_APP
; ENABLE-SPR-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1
; ENABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0
; ENABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0
; ENABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0
; ENABLE-SPR-NEXT: retq
;
; DISABLE-ADL-LABEL: permps:
; DISABLE-ADL: # %bb.0:
; DISABLE-ADL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-ADL-NEXT: #APP
; DISABLE-ADL-NEXT: nop
; DISABLE-ADL-NEXT: #NO_APP
; DISABLE-ADL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-ADL-NEXT: vpermps %ymm2, %ymm0, %ymm1
; DISABLE-ADL-NEXT: vcvtdq2ps %ymm0, %ymm0
; DISABLE-ADL-NEXT: vaddps %ymm2, %ymm0, %ymm0
; DISABLE-ADL-NEXT: vaddps %ymm0, %ymm1, %ymm0
; DISABLE-ADL-NEXT: retq
;
; DISABLE-SPR-LABEL: permps:
; DISABLE-SPR: # %bb.0:
; DISABLE-SPR-NEXT: vmovaps %ymm0, %ymm16
; DISABLE-SPR-NEXT: #APP
; DISABLE-SPR-NEXT: nop
; DISABLE-SPR-NEXT: #NO_APP
; DISABLE-SPR-NEXT: vpermps %ymm16, %ymm0, %ymm1
; DISABLE-SPR-NEXT: vcvtdq2ps %ymm0, %ymm0
; DISABLE-SPR-NEXT: vaddps %ymm16, %ymm0, %ymm0
; DISABLE-SPR-NEXT: vaddps %ymm0, %ymm1, %ymm0
; DISABLE-SPR-NEXT: retq
  %1 = tail call <8 x i32> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %1)
  %t = sitofp <8 x i32> %1 to <8 x float>
  %3 = fadd <8 x float> %t, %a0
  %res = fadd <8 x float> %2, %3
  ret <8 x float> %res
}

; vpermps with a folded memory operand as the data source.
define <8 x float> @permps_mem(ptr %p0, <8 x i32> %a1) {
; ENABLE-LABEL: permps_mem:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1
; ENABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
; ENABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permps_mem:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermps (%rdi), %ymm0, %ymm1
; DISABLE-NEXT: vcvtdq2ps %ymm0, %ymm0
; DISABLE-NEXT: vaddps %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <8 x float>, ptr %p0, align 64
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1)
  %t = sitofp <8 x i32> %a1 to <8 x float>
  %res = fadd <8 x float> %2, %t
  ret <8 x float> %res
}

declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

; vpermpd (immediate shuffle) with a register source.
define <4 x double> @permpd(<4 x double> %a0) {
; ENABLE-LABEL: permpd:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
; ENABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permpd:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,2,1,0]
; DISABLE-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  %res = fadd <4 x double> %2, %a0
  ret <4 x double> %res
}

; vpermpd with a memory source.
define <4 x double> @permpd_mem(ptr %p0) {
; ENABLE-LABEL: permpd_mem:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ENABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: permpd_mem:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpermpd {{.*#+}} ymm0 = mem[1,2,1,0]
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %a0 = load <4 x double>, ptr %p0, align 64
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %2
}