; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.
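;
; (The asm returns a value in an xmm register and clobbers all the remaining
; xmm registers, so at least one live argument is evicted from its register
; and spilled across the asm; the CHECK lines then verify that the reload is
; folded into the tested instruction as a memory operand rather than emitted
; as a separate move.)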

define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
; CHECK-LABEL: stack_fold_broadcastsd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = [4.9406564584124654E-324,0.0E+0]
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x1, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_broadcastss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ; fadd forces execution domain
  %3 = fadd <4 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0>
  ret <4 x float> %3
}

define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
; CHECK-LABEL: stack_fold_broadcastss_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ; fadd forces execution domain
  %3 = fadd <8 x float> %2, <float 1.0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
  ret <8 x float> %3
}

define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_extracti128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    vextracti128 $1, %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Folded Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  ; zext forces execution domain
  %t1 = zext <8 x i16> %a0 to <8 x i32>
  %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  ret <4 x i32> %t2
}

define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_inserti128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_mpsadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmpsadbw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

define <32 x i8> @stack_fold_pabsb(<32 x i8> %a0) {
; CHECK-LABEL: stack_fold_pabsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, zeroinitializer
  %3 = sub <32 x i8> zeroinitializer, %a0
  %4 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %3
  ret <32 x i8> %4
}

define <8 x i32> @stack_fold_pabsd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_pabsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, zeroinitializer
  %3 = sub <8 x i32> zeroinitializer, %a0
  %4 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %3
  ret <8 x i32> %4
}

define <16 x i16> @stack_fold_pabsw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_pabsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpabsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, zeroinitializer
  %3 = sub <16 x i16> zeroinitializer, %a0
  %4 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %3
  ret <16 x i16> %4
}

define <16 x i16> @stack_fold_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_packssdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackssdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @stack_fold_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_packsswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpacksswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_packusdw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackusdw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @stack_fold_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_packuswb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpackuswb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_paddb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <32 x i8> %a0, %a1
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_paddd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_paddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <8 x i32> %a0, %a1
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_paddq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_paddq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <4 x i64> %a0, %a1
  ret <4 x i64> %2
}

define <32 x i8> @stack_fold_paddsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_paddsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_paddusb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_paddusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_paddusw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_paddw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_paddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = add <16 x i16> %a0, %a1
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_palignr:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpalignr $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  ret <32 x i8> %2
}

define <32 x i8> @stack_fold_pand(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pand:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpand {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = and <32 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}

define <32 x i8> @stack_fold_pandn(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pandn:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpandn {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <32 x i8> %a0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %3 = and <32 x i8> %2, %a1
  ; add forces execution domain
  %4 = add <32 x i8> %3, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %4
}

define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pavgb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <32 x i8> %a0 to <32 x i16>
  %3 = zext <32 x i8> %a1 to <32 x i16>
  %4 = add <32 x i16> %2, %3
  %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %7 = trunc <32 x i16> %6 to <32 x i8>
  ret <32 x i8> %7
}

define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pavgw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpavgw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = zext <16 x i16> %a0 to <16 x i32>
  %3 = zext <16 x i16> %a1 to <16 x i32>
  %4 = add <16 x i32> %2, %3
  %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %7 = trunc <16 x i32> %6 to <16 x i16>
  ret <16 x i16> %7
}

define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pblendd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    # xmm0 = mem[0,1,2],xmm0[3]
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pblendd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <32 x i8> @stack_fold_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %c) {
; CHECK-LABEL: stack_fold_pblendvb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a1, <32 x i8> %c, <32 x i8> %a0)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pblendw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpblendw $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2],ymm0[3,4,5,6,7],mem[8,9,10],ymm0[11,12,13,14,15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %2
}

define <16 x i8> @stack_fold_pbroadcastb(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %2
}

define <32 x i8> @stack_fold_pbroadcastb_ymm(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastb_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> zeroinitializer
  ret <32 x i8> %2
}

define <4 x i32> @stack_fold_pbroadcastd(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <4 x i32> %2, <i32 2, i32 1, i32 1, i32 1>
  ret <4 x i32> %3
}

define <8 x i32> @stack_fold_pbroadcastd_ymm(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <2 x i64> @stack_fold_pbroadcastq(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <2 x i64> %2, <i64 2, i64 1>
  ret <2 x i64> %3
}

define <4 x i64> @stack_fold_pbroadcastq_ymm(<2 x i64> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <8 x i16> @stack_fold_pbroadcastw(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %2
}

define <16 x i16> @stack_fold_pbroadcastw_ymm(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pbroadcastw_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpbroadcastw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> zeroinitializer
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <32 x i8> %a0, %a1
  %3 = sext <32 x i1> %2 to <32 x i8>
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <8 x i32> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <4 x i64> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpeqw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpeqw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp eq <16 x i16> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i16>
  ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = sext <32 x i1> %2 to <32 x i8>
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = sext <8 x i1> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <4 x i64> %a0, %a1
  %3 = sext <4 x i1> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pcmpgtw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpcmpgtw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = sext <16 x i1> %2 to <16 x i16>
  ret <16 x i16> %3
}

define <8 x i32> @stack_fold_perm2i128(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_perm2i128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vperm2i128 $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[2,3],mem[0,1]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_permd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <4 x double> @stack_fold_permpd(<4 x double> %a0) {
; CHECK-LABEL: stack_fold_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; fadd forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <8 x float> @stack_fold_permps(<8 x i32> %a0, <8 x float> %a1) {
; CHECK-LABEL: stack_fold_permps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
  ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
; CHECK-LABEL: stack_fold_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpermq $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,2,3]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <8 x i32> @stack_fold_phaddd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_phaddd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_phaddsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_phaddw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phaddw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphaddw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i32> @stack_fold_phsubd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_phsubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_phsubsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_phsubw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_phsubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vphsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaddubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_pmaddwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaddwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaddwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pmaxsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp sgt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pmaxub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pmaxub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pmaxud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pmaxuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmaxuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ugt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminsd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pminsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp slt <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <32 x i8> @stack_fold_pminub(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pminub:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminub {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <32 x i8> %a0, %a1
  %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %a1
  ret <32 x i8> %3
}

define <8 x i32> @stack_fold_pminud(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_pminud:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminud {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <8 x i32> %a0, %a1
  %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %a1
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_pminuw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpminuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = icmp ult <16 x i16> %a0, %a1
  %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %a1
  ret <16 x i16> %3
}

define <8 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = sext <8 x i8> %2 to <8 x i32>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = sext <4 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: stack_fold_pmovsxbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <16 x i8> %a0 to <16 x i16>
  ret <16 x i16> %2
}

define <4 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: stack_fold_pmovsxdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <4 x i32> %a0 to <4 x i64>
  ret <4 x i64> %2
}

define <8 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sext <8 x i16> %a0 to <8 x i32>
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: stack_fold_pmovsxwq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmovsxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1210 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1211 %3 = sext <4 x i16> %2 to <4 x i64> 1212 ret <4 x i64> %3 1213} 1214 1215define <8 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) { 1216; CHECK-LABEL: stack_fold_pmovzxbd: 1217; CHECK: # %bb.0: 1218; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1219; CHECK-NEXT: #APP 1220; CHECK-NEXT: nop 1221; CHECK-NEXT: #NO_APP 1222; CHECK-NEXT: vpmovzxbd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 1223; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1224; CHECK-NEXT: retq 1225 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1226 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1227 %3 = zext <8 x i8> %2 to <8 x i32> 1228 ret <8 x i32> %3 1229} 1230 1231define <4 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { 1232; CHECK-LABEL: stack_fold_pmovzxbq: 1233; CHECK: # %bb.0: 1234; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1235; CHECK-NEXT: #APP 1236; CHECK-NEXT: nop 1237; CHECK-NEXT: #NO_APP 1238; CHECK-NEXT: vpmovzxbq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 1239; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero 1240; CHECK-NEXT: retq 1241 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1242 %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1243 %3 = zext <4 x i8> %2 to <4 x i64> 1244 ret <4 x i64> %3 1245} 1246 1247define <16 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) { 1248; CHECK-LABEL: stack_fold_pmovzxbw: 1249; CHECK: # %bb.0: 1250; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1251; CHECK-NEXT: #APP 1252; CHECK-NEXT: nop 1253; CHECK-NEXT: #NO_APP 1254; CHECK-NEXT: vpmovzxbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 1255; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero 1256; CHECK-NEXT: retq 1257 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1258 %2 = zext <16 x i8> %a0 to <16 x i16> 1259 ret <16 x i16> %2 1260} 1261 1262define <4 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) { 1263; CHECK-LABEL: stack_fold_pmovzxdq: 1264; CHECK: # %bb.0: 1265; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1266; CHECK-NEXT: #APP 1267; CHECK-NEXT: nop 1268; CHECK-NEXT: #NO_APP 1269; CHECK-NEXT: vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 1270; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 1271; CHECK-NEXT: retq 1272 %1 = tail call <2 x i64> 
asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1273 %2 = zext <4 x i32> %a0 to <4 x i64> 1274 ret <4 x i64> %2 1275} 1276 1277define <8 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) { 1278; CHECK-LABEL: stack_fold_pmovzxwd: 1279; CHECK: # %bb.0: 1280; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1281; CHECK-NEXT: #APP 1282; CHECK-NEXT: nop 1283; CHECK-NEXT: #NO_APP 1284; CHECK-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 1285; CHECK-NEXT: # ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 1286; CHECK-NEXT: retq 1287 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1288 %2 = zext <8 x i16> %a0 to <8 x i32> 1289 ret <8 x i32> %2 1290} 1291 1292define <4 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) { 1293; CHECK-LABEL: stack_fold_pmovzxwq: 1294; CHECK: # %bb.0: 1295; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1296; CHECK-NEXT: #APP 1297; CHECK-NEXT: nop 1298; CHECK-NEXT: #NO_APP 1299; CHECK-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload 1300; CHECK-NEXT: # ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1301; CHECK-NEXT: retq 1302 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1303 %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1304 %3 = zext <4 x i16> %2 to <4 x i64> 1305 ret <4 x i64> %3 1306} 1307 1308define <4 x i64> @stack_fold_pmuldq(<8 x i32> %a0, <8 x i32> %a1) { 1309; CHECK-LABEL: stack_fold_pmuldq: 1310; CHECK: # %bb.0: 1311; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1312; CHECK-NEXT: #APP 1313; CHECK-NEXT: nop 1314; CHECK-NEXT: #NO_APP 1315; CHECK-NEXT: vpmuldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1316; CHECK-NEXT: retq 1317 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1318 %2 = bitcast <8 x i32> %a0 to <4 x i64> 1319 %3 = bitcast <8 x i32> %a1 to <4 x i64> 1320 %4 = shl <4 x i64> %2, <i64 32, i64 32, i64 32, i64 32> 1321 %5 = ashr <4 x i64> %4, <i64 32, i64 32, i64 32, i64 32> 1322 %6 = shl <4 x i64> %3, <i64 32, i64 32, i64 32, i64 32> 1323 %7 = ashr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32> 1324 %8 = mul <4 x i64> %5, %7 1325 ret <4 x i64> %8 1326} 1327 1328define <16 x i16> @stack_fold_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1) { 1329; CHECK-LABEL: stack_fold_pmulhrsw: 1330; CHECK: # %bb.0: 1331; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1332; CHECK-NEXT: #APP 1333; CHECK-NEXT: nop 1334; CHECK-NEXT: #NO_APP 1335; CHECK-NEXT: vpmulhrsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1336; CHECK-NEXT: retq 1337 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1338 %2 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) 1339 ret <16 x i16> %2 1340} 1341declare <16 x i16> 
@llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone 1342 1343define <16 x i16> @stack_fold_pmulhuw(<16 x i16> %a0, <16 x i16> %a1) { 1344; CHECK-LABEL: stack_fold_pmulhuw: 1345; CHECK: # %bb.0: 1346; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1347; CHECK-NEXT: #APP 1348; CHECK-NEXT: nop 1349; CHECK-NEXT: #NO_APP 1350; CHECK-NEXT: vpmulhuw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1351; CHECK-NEXT: retq 1352 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1353 %2 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) 1354 ret <16 x i16> %2 1355} 1356declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone 1357 1358define <16 x i16> @stack_fold_pmulhw(<16 x i16> %a0, <16 x i16> %a1) { 1359; CHECK-LABEL: stack_fold_pmulhw: 1360; CHECK: # %bb.0: 1361; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1362; CHECK-NEXT: #APP 1363; CHECK-NEXT: nop 1364; CHECK-NEXT: #NO_APP 1365; CHECK-NEXT: vpmulhw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1366; CHECK-NEXT: retq 1367 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1368 %2 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) 1369 ret <16 x i16> %2 1370} 1371declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone 1372 1373define <8 x i32> @stack_fold_pmulld(<8 x i32> %a0, <8 x i32> %a1) { 1374; CHECK-LABEL: stack_fold_pmulld: 1375; CHECK: # %bb.0: 1376; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1377; CHECK-NEXT: #APP 1378; CHECK-NEXT: nop 1379; CHECK-NEXT: #NO_APP 1380; CHECK-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1381; CHECK-NEXT: retq 1382 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1383 %2 = mul <8 x i32> %a0, %a1 1384 ret <8 x i32> %2 1385} 1386 1387define <16 x i16> @stack_fold_pmullw(<16 x i16> %a0, <16 x i16> %a1) { 1388; CHECK-LABEL: stack_fold_pmullw: 1389; CHECK: # %bb.0: 1390; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1391; CHECK-NEXT: #APP 1392; CHECK-NEXT: nop 1393; CHECK-NEXT: #NO_APP 1394; CHECK-NEXT: vpmullw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1395; CHECK-NEXT: retq 1396 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1397 %2 = mul <16 x i16> %a0, %a1 1398 ret <16 x i16> %2 1399} 1400 1401define <4 x i64> @stack_fold_pmuludq(<8 x i32> %a0, <8 x i32> %a1) { 1402; CHECK-LABEL: stack_fold_pmuludq: 1403; CHECK: # %bb.0: 1404; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1405; CHECK-NEXT: #APP 1406; CHECK-NEXT: nop 1407; CHECK-NEXT: #NO_APP 1408; CHECK-NEXT: vpmuludq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 1409; CHECK-NEXT: retq 1410 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() 1411 %2 = bitcast <8 x i32> %a0 to <4 x i64> 1412 
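  ; Note (added commentary): the and-masks below are the canonical zero-extend
  ; idiom for the even 32-bit lanes, which instruction selection is expected to
  ; match to vpmuludq; stack_fold_pmuldq above uses the shl/ashr pair as the
  ; equivalent sign-extend idiom for vpmuldq.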
  %3 = bitcast <8 x i32> %a1 to <4 x i64>
  %4 = and <4 x i64> %2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %5 = and <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %6 = mul <4 x i64> %4, %5
  ret <4 x i64> %6
}

define <32 x i8> @stack_fold_por(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_por:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = or <32 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}

define <4 x i64> @stack_fold_psadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psadbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <32 x i8> @stack_fold_pshufb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pshufb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
; CHECK-LABEL: stack_fold_pshufd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufd $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_vpshufhw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshufhw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i16> %2
}

define <16 x i16> @stack_fold_vpshuflw(<16 x i16> %a0) {
; CHECK-LABEL: stack_fold_vpshuflw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpshuflw $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = mem[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_psignb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psignb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <8 x i32> @stack_fold_psignd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psignd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_psignw(<16 x i16> %a0, <16 x i16> %a1) {
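; Note (added commentary): psign copies, negates, or zeroes each element of
; %a0 according to whether the corresponding element of %a1 is positive,
; negative, or zero.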
; CHECK-LABEL: stack_fold_psignw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsignw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <8 x i32> @stack_fold_pslld(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_pslld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpslld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @stack_fold_psllq(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psllvd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psllvd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllvq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psllvq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i16> @stack_fold_psllw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psllw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsllw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <8 x i32> @stack_fold_psrad(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrad:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrad {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psravd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psravd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsravd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <16 x i16> @stack_fold_psraw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psraw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsraw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone

define <8 x i32> @stack_fold_psrld(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrld:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrld {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @stack_fold_psrlq(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
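  ; Note (added commentary): the shift count for vpsrlq is taken from the low
  ; 64 bits of the second operand, which is why only a 16-byte (xmm) slot is
  ; spilled and folded here even though the shifted data occupies a full ymm.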
  %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrlvd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
  ret <4 x i32> %2
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psrlvd_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
  ret <8 x i32> %2
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlvq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %2
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psrlvq_ymm:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlvq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %2
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <16 x i16> @stack_fold_psrlw(<16 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: stack_fold_psrlw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsrlw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <32 x i8> @stack_fold_psubb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <32 x i8> %a0, %a1
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_psubd(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_psubd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <8 x i32> %a0, %a1
  ret <8 x i32> %2
}

define <4 x i64> @stack_fold_psubq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_psubq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <4 x i64> %a0, %a1
  ret <4 x i64> %2
}

define <32 x i8> @stack_fold_psubsb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubsb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_psubsw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubsw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubsw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <32 x i8> @stack_fold_psubusb(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_psubusb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusb {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  ret <32 x i8> %2
}
declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone

define <16 x i16> @stack_fold_psubusw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubusw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubusw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1)
  ret <16 x i16> %2
}
declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @stack_fold_psubw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_psubw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpsubw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = sub <16 x i16> %a0, %a1
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_punpckhbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpckhbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_punpckhdq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckhdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpckhqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_punpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpckhwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_punpcklbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_punpcklbw:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %2
}

define <8 x i32> @stack_fold_punpckldq(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: stack_fold_punpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ; add forces execution domain
  %3 = add <8 x i32> %2, <i32 2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %3
}

define <4 x i64> @stack_fold_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: stack_fold_punpcklqdq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ; add forces execution domain
  %3 = add <4 x i64> %2, <i64 2, i64 1, i64 1, i64 1>
  ret <4 x i64> %3
}

define <16 x i16> @stack_fold_punpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: stack_fold_punpcklwd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  ret <16 x i16> %2
}

define <32 x i8> @stack_fold_pxor(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: stack_fold_pxor:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpxor {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
  %2 = xor <32 x i8> %a0, %a1
  ; add forces execution domain
  %3 = add <32 x i8> %2, <i8 2, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  ret <32 x i8> %3
}