; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=X64,X64-SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define i32 @t(ptr %val) nounwind {
; X86-SSE2-LABEL: t:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl 8(%eax), %eax
; X86-SSE2-NEXT:    retl
;
; X64-LABEL: t:
; X64:       # %bb.0:
; X64-NEXT:    movl 8(%rdi), %eax
; X64-NEXT:    retq
  %tmp2 = load <2 x i64>, ptr %val, align 16		; <<2 x i64>> [#uses=1]
  %tmp3 = bitcast <2 x i64> %tmp2 to <4 x i32>		; <<4 x i32>> [#uses=1]
  %tmp4 = extractelement <4 x i32> %tmp3, i32 2		; <i32> [#uses=1]
  ret i32 %tmp4
}

; Case where extractelement of load ends up as undef.
; (Making sure this doesn't crash.)
define i32 @t2(ptr %xp) {
; X86-SSE2-LABEL: t2:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    retl
;
; X64-LABEL: t2:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %x = load <8 x i32>, ptr %xp
  %Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
  %y = extractelement <8 x i32> %Shuff68, i32 0
  ret i32 %y
}

; This case could easily end up inf-looping in the DAG combiner due to a
; low-alignment load of the vector, which prevents us from reliably forming a
; narrow load.

define void @t3(ptr %a0) {
; X86-SSE2-LABEL: t3:
; X86-SSE2:       # %bb.0: # %bb
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movups (%eax), %xmm0
; X86-SSE2-NEXT:    movhps %xmm0, (%eax)
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t3:
; X64-SSSE3:       # %bb.0: # %bb
; X64-SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; X64-SSSE3-NEXT:    movsd %xmm0, (%rax)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: t3:
; X64-AVX:       # %bb.0: # %bb
; X64-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-AVX-NEXT:    vmovsd %xmm0, (%rax)
; X64-AVX-NEXT:    retq
bb:
  %tmp13 = load <2 x double>, ptr %a0, align 1
  %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1
  store double %.sroa.3.24.vec.extract, ptr undef, align 8
  ret void
}

; Case where a load is unary shuffled, then bitcast (to a type with the same
; number of elements) before extractelement.
; This is testing for an assertion failure - the extraction was assuming that
; the undef second shuffle operand was a post-bitcast type instead of a
; pre-bitcast type.
define i64 @t4(ptr %a) {
; X86-SSE2-LABEL: t4:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl (%ecx), %eax
; X86-SSE2-NEXT:    movl 4(%ecx), %edx
; X86-SSE2-NEXT:    retl
;
; X64-LABEL: t4:
; X64:       # %bb.0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    retq
  %b = load <2 x double>, ptr %a, align 16
  %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
  %d = bitcast <2 x double> %c to <2 x i64>
  %e = extractelement <2 x i64> %d, i32 1
  ret i64 %e
}
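
; Illustrative sketch (hand-added; not covered by the autogenerated assertions
; above): under the same extractelement(load) -> narrow scalar load fold shown
; in @t, extracting element 1 of an aligned <4 x i32> load is expected to lower
; to a single 4-byte load at offset 4 rather than a full vector load.
define i32 @narrow_load_sketch(ptr %p) nounwind {
  %v = load <4 x i32>, ptr %p, align 16
  %e = extractelement <4 x i32> %v, i32 1
  ret i32 %e
}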

; Don't extract from a volatile.
define void @t5(ptr%a0, ptr%a1) {
; X86-SSE2-LABEL: t5:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movaps (%ecx), %xmm0
; X86-SSE2-NEXT:    movhps %xmm0, (%eax)
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t5:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movaps (%rdi), %xmm0
; X64-SSSE3-NEXT:    movhps %xmm0, (%rsi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: t5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovhps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
  %vecload = load volatile <2 x double>, ptr %a0, align 16
  %vecext = extractelement <2 x double> %vecload, i32 1
  store volatile double %vecext, ptr %a1, align 8
  ret void
}

; Check for multiuse.
define float @t6(ptr%a0) {
; X86-SSE2-LABEL: t6:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %eax
; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movaps (%eax), %xmm0
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
; X86-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; X86-SSE2-NEXT:    andps %xmm1, %xmm2
; X86-SSE2-NEXT:    andnps %xmm0, %xmm1
; X86-SSE2-NEXT:    orps %xmm2, %xmm1
; X86-SSE2-NEXT:    movss %xmm1, (%esp)
; X86-SSE2-NEXT:    flds (%esp)
; X86-SSE2-NEXT:    popl %eax
; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: t6:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; X64-SSSE3-NEXT:    andps %xmm0, %xmm2
; X64-SSSE3-NEXT:    andnps %xmm1, %xmm0
; X64-SSSE3-NEXT:    orps %xmm2, %xmm0
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: t6:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX1-NEXT:    vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: t6:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
  %vecload = load <8 x float>, ptr %a0, align 32
  %vecext = extractelement <8 x float> %vecload, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

define void @PR43971(ptr%a0, ptr%a1) {
; X86-SSE2-LABEL: PR43971:
; X86-SSE2:       # %bb.0: # %entry
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movaps 16(%ecx), %xmm0
; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
; X86-SSE2-NEXT:    cmpltss %xmm0, %xmm1
; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-SSE2-NEXT:    andps %xmm1, %xmm2
; X86-SSE2-NEXT:    andnps %xmm0, %xmm1
; X86-SSE2-NEXT:    orps %xmm2, %xmm1
; X86-SSE2-NEXT:    movss %xmm1, (%eax)
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: PR43971:
; X64-SSSE3:       # %bb.0: # %entry
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    xorps %xmm1, %xmm1
; X64-SSSE3-NEXT:    cmpltss %xmm0, %xmm1
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSSE3-NEXT:    andps %xmm1, %xmm2
; X64-SSSE3-NEXT:    andnps %xmm0, %xmm1
; X64-SSSE3-NEXT:    orps %xmm2, %xmm1
; X64-SSSE3-NEXT:    movss %xmm1, (%rsi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: PR43971:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1
; X64-AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovss %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
entry:
  %0 = load <8 x float>, ptr %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 6
  %cmp = fcmp ogt float %vecext, 0.000000e+00
  %1 = load float, ptr %a1, align 4
  %cond = select i1 %cmp, float %1, float %vecext
  store float %cond, ptr %a1, align 4
  ret void
}

define float @PR43971_1(ptr%a0) nounwind {
; X86-SSE2-LABEL: PR43971_1:
; X86-SSE2:       # %bb.0: # %entry
; X86-SSE2-NEXT:    pushl %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movaps (%eax), %xmm0
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    xorps %xmm1, %xmm1
; X86-SSE2-NEXT:    cmpeqss %xmm0, %xmm1
; X86-SSE2-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; X86-SSE2-NEXT:    andps %xmm1, %xmm2
; X86-SSE2-NEXT:    andnps %xmm0, %xmm1
; X86-SSE2-NEXT:    orps %xmm2, %xmm1
; X86-SSE2-NEXT:    movss %xmm1, (%esp)
; X86-SSE2-NEXT:    flds (%esp)
; X86-SSE2-NEXT:    popl %eax
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: PR43971_1:
; X64-SSSE3:       # %bb.0: # %entry
; X64-SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
; X64-SSSE3-NEXT:    xorps %xmm0, %xmm0
; X64-SSSE3-NEXT:    cmpeqss %xmm1, %xmm0
; X64-SSSE3-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; X64-SSSE3-NEXT:    andps %xmm0, %xmm2
; X64-SSSE3-NEXT:    andnps %xmm1, %xmm0
; X64-SSSE3-NEXT:    orps %xmm2, %xmm0
; X64-SSSE3-NEXT:    retq
;
; X64-AVX1-LABEL: PR43971_1:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX1-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX1-NEXT:    vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: PR43971_1:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; X64-AVX2-NEXT:    retq
entry:
  %0 = load <8 x float>, ptr %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

define i32 @PR85419(ptr %p0) {
; X86-SSE2-LABEL: PR85419:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl (%ecx), %edx
; X86-SSE2-NEXT:    xorl %eax, %eax
; X86-SSE2-NEXT:    orl 4(%ecx), %edx
; X86-SSE2-NEXT:    je .LBB8_2
; X86-SSE2-NEXT:  # %bb.1:
; X86-SSE2-NEXT:    movl 8(%ecx), %eax
; X86-SSE2-NEXT:  .LBB8_2:
; X86-SSE2-NEXT:    retl
;
; X64-LABEL: PR85419:
; X64:       # %bb.0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    cmpq $0, (%rdi)
; X64-NEXT:    je .LBB8_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    movl 8(%rdi), %eax
; X64-NEXT:  .LBB8_2:
; X64-NEXT:    retq
  %load = load <2 x i64>, ptr %p0, align 16
  %vecext.i = extractelement <2 x i64> %load, i64 0
  %cmp = icmp eq i64 %vecext.i, 0
  %.cast = bitcast <2 x i64> %load to <4 x i32>
  %vecext.i2 = extractelement <4 x i32> %.cast, i64 2
  %retval.0 = select i1 %cmp, i32 0, i32 %vecext.i2
  ret i32 %retval.0
}

; Test for bad extractions from a VBROADCAST_LOAD of the <2 x i16> non-uniform constant bitcast as <4 x i32>.
define void @subextract_broadcast_load_constant(ptr nocapture %0, ptr nocapture %1, ptr nocapture %2) nounwind {
; X86-SSE2-LABEL: subextract_broadcast_load_constant:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT:    movl $-1583308898, (%edx) # imm = 0xA1A09F9E
; X86-SSE2-NEXT:    movw $-24674, (%ecx) # imm = 0x9F9E
; X86-SSE2-NEXT:    movw $-24160, (%eax) # imm = 0xA1A0
; X86-SSE2-NEXT:    retl
;
; X64-LABEL: subextract_broadcast_load_constant:
; X64:       # %bb.0:
; X64-NEXT:    movl $-1583308898, (%rdi) # imm = 0xA1A09F9E
; X64-NEXT:    movw $-24674, (%rsi) # imm = 0x9F9E
; X64-NEXT:    movw $-24160, (%rdx) # imm = 0xA1A0
; X64-NEXT:    retq
  store i8 -98, ptr %0, align 1
  %4 = getelementptr inbounds i8, ptr %0, i64 1
  store i8 -97, ptr %4, align 1
  %5 = getelementptr inbounds i8, ptr %0, i64 2
  store i8 -96, ptr %5, align 1
  %6 = getelementptr inbounds i8, ptr %0, i64 3
  store i8 -95, ptr %6, align 1
  %7 = load <2 x i16>, ptr %0, align 4
  %8 = extractelement <2 x i16> %7, i32 0
  store i16 %8, ptr %1, align 2
  %9 = extractelement <2 x i16> %7, i32 1
  store i16 %9, ptr %2, align 2
  ret void
}
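
; Illustrative sketch (hand-added; no autogenerated assertions): a reduced form
; of the case above with a single extract. The i32 store writes the same bytes
; (0xA1A09F9E), so, assuming the same constant folding of the loaded <2 x i16>,
; the low element (0x9F9E) is expected to be stored as an immediate.
define void @subextract_constant_low_sketch(ptr %p, ptr %q) nounwind {
  store i32 -1583308898, ptr %p, align 4
  %v = load <2 x i16>, ptr %p, align 4
  %lo = extractelement <2 x i16> %v, i32 0
  store i16 %lo, ptr %q, align 2
  ret void
}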

; A scalar load is favored over an XMM->GPR register transfer in this example.

define i32 @multi_use_load_scalarization(ptr %p) nounwind {
; X86-SSE2-LABEL: multi_use_load_scalarization:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movl (%ecx), %eax
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; X86-SSE2-NEXT:    psubd %xmm1, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm0, (%ecx)
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: multi_use_load_scalarization:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movl (%rdi), %eax
; X64-SSSE3-NEXT:    movdqu (%rdi), %xmm0
; X64-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-SSSE3-NEXT:    psubd %xmm1, %xmm0
; X64-SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: multi_use_load_scalarization:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movl (%rdi), %eax
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %v = load <4 x i32>, ptr %p, align 1
  %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %v1, ptr %p
  %r = extractelement <4 x i32> %v, i64 0
  ret i32 %r
}

define i32 @multi_use_volatile_load_scalarization(ptr %p) nounwind {
; X86-SSE2-LABEL: multi_use_volatile_load_scalarization:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; X86-SSE2-NEXT:    movd %xmm0, %eax
; X86-SSE2-NEXT:    psubd %xmm1, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm0, (%ecx)
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: multi_use_volatile_load_scalarization:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    movdqu (%rdi), %xmm0
; X64-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; X64-SSSE3-NEXT:    movd %xmm0, %eax
; X64-SSSE3-NEXT:    psubd %xmm1, %xmm0
; X64-SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: multi_use_volatile_load_scalarization:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; X64-AVX-NEXT:    vmovd %xmm0, %eax
; X64-AVX-NEXT:    retq
  %v = load volatile <4 x i32>, ptr %p, align 1
  %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %v1, ptr %p
  %r = extractelement <4 x i32> %v, i64 0
  ret i32 %r
}
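
; Illustrative sketch (hand-added; no autogenerated assertions): the same
; multi-use pattern as @multi_use_load_scalarization, but extracting element 2.
; Assuming the scalar-load fold still applies to the non-volatile load, the
; extract is expected to become a 4-byte load at offset 8 alongside the vector
; update.
define i32 @multi_use_load_scalarization_elt2_sketch(ptr %p) nounwind {
  %v = load <4 x i32>, ptr %p, align 1
  %v1 = add <4 x i32> %v, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %v1, ptr %p
  %r = extractelement <4 x i32> %v, i64 2
  ret i32 %r
}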

; This test is reduced from a C source example that showed a miscompile:
; https://github.com/llvm/llvm-project/issues/53695
; The scalarized loads from 'zero' in the AVX asm must occur before
; the vector store to 'zero' overwrites the values.
; If compiled to a binary, this test should return 0 if correct.

@n1 = local_unnamed_addr global <8 x i32> <i32 0, i32 42, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0>, align 32
@zero = internal unnamed_addr global <8 x i32> zeroinitializer, align 32

define i32 @main() nounwind {
; X86-SSE2-LABEL: main:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pushl %ebp
; X86-SSE2-NEXT:    movl %esp, %ebp
; X86-SSE2-NEXT:    pushl %edi
; X86-SSE2-NEXT:    pushl %esi
; X86-SSE2-NEXT:    andl $-32, %esp
; X86-SSE2-NEXT:    subl $64, %esp
; X86-SSE2-NEXT:    movaps n1+16, %xmm0
; X86-SSE2-NEXT:    movaps n1, %xmm1
; X86-SSE2-NEXT:    movl zero+4, %ecx
; X86-SSE2-NEXT:    movl zero+8, %eax
; X86-SSE2-NEXT:    movaps %xmm1, zero
; X86-SSE2-NEXT:    movaps %xmm0, zero+16
; X86-SSE2-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2]
; X86-SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    movaps %xmm0, (%esp)
; X86-SSE2-NEXT:    movdqa (%esp), %xmm0
; X86-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-SSE2-NEXT:    movd %xmm1, %esi
; X86-SSE2-NEXT:    xorl %edx, %edx
; X86-SSE2-NEXT:    divl %esi
; X86-SSE2-NEXT:    movl %eax, %esi
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X86-SSE2-NEXT:    movd %xmm0, %edi
; X86-SSE2-NEXT:    movl %ecx, %eax
; X86-SSE2-NEXT:    xorl %edx, %edx
; X86-SSE2-NEXT:    divl %edi
; X86-SSE2-NEXT:    addl %esi, %eax
; X86-SSE2-NEXT:    leal -8(%ebp), %esp
; X86-SSE2-NEXT:    popl %esi
; X86-SSE2-NEXT:    popl %edi
; X86-SSE2-NEXT:    popl %ebp
; X86-SSE2-NEXT:    retl
;
; X64-SSSE3-LABEL: main:
; X64-SSSE3:       # %bb.0:
; X64-SSSE3-NEXT:    pushq %rbp
; X64-SSSE3-NEXT:    movq %rsp, %rbp
; X64-SSSE3-NEXT:    andq $-32, %rsp
; X64-SSSE3-NEXT:    subq $64, %rsp
; X64-SSSE3-NEXT:    movq n1@GOTPCREL(%rip), %rax
; X64-SSSE3-NEXT:    movaps (%rax), %xmm0
; X64-SSSE3-NEXT:    movaps 16(%rax), %xmm1
; X64-SSSE3-NEXT:    movl zero+4(%rip), %ecx
; X64-SSSE3-NEXT:    movl zero+8(%rip), %eax
; X64-SSSE3-NEXT:    movaps %xmm0, zero(%rip)
; X64-SSSE3-NEXT:    movaps %xmm1, zero+16(%rip)
; X64-SSSE3-NEXT:    movaps {{.*#+}} xmm0 = [2,2,2,2]
; X64-SSSE3-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
; X64-SSSE3-NEXT:    movaps %xmm0, (%rsp)
; X64-SSSE3-NEXT:    movdqa (%rsp), %xmm0
; X64-SSSE3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-SSSE3-NEXT:    movd %xmm1, %esi
; X64-SSSE3-NEXT:    xorl %edx, %edx
; X64-SSSE3-NEXT:    divl %esi
; X64-SSSE3-NEXT:    movl %eax, %esi
; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; X64-SSSE3-NEXT:    movd %xmm0, %edi
; X64-SSSE3-NEXT:    movl %ecx, %eax
; X64-SSSE3-NEXT:    xorl %edx, %edx
; X64-SSSE3-NEXT:    divl %edi
; X64-SSSE3-NEXT:    addl %esi, %eax
; X64-SSSE3-NEXT:    movq %rbp, %rsp
; X64-SSSE3-NEXT:    popq %rbp
; X64-SSSE3-NEXT:    retq
;
; X64-AVX-LABEL: main:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    pushq %rbp
; X64-AVX-NEXT:    movq %rsp, %rbp
; X64-AVX-NEXT:    andq $-32, %rsp
; X64-AVX-NEXT:    subq $64, %rsp
; X64-AVX-NEXT:    movq n1@GOTPCREL(%rip), %rax
; X64-AVX-NEXT:    vmovaps (%rax), %ymm0
; X64-AVX-NEXT:    movl zero+4(%rip), %ecx
; X64-AVX-NEXT:    movl zero+8(%rip), %eax
; X64-AVX-NEXT:    vmovaps %ymm0, zero(%rip)
; X64-AVX-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
; X64-AVX-NEXT:    vmovaps %ymm0, (%rsp)
; X64-AVX-NEXT:    vmovaps (%rsp), %ymm0
; X64-AVX-NEXT:    vextractps $2, %xmm0, %esi
; X64-AVX-NEXT:    xorl %edx, %edx
; X64-AVX-NEXT:    divl %esi
; X64-AVX-NEXT:    movl %eax, %esi
; X64-AVX-NEXT:    vextractps $1, %xmm0, %edi
; X64-AVX-NEXT:    movl %ecx, %eax
; X64-AVX-NEXT:    xorl %edx, %edx
; X64-AVX-NEXT:    divl %edi
; X64-AVX-NEXT:    addl %esi, %eax
; X64-AVX-NEXT:    movq %rbp, %rsp
; X64-AVX-NEXT:    popq %rbp
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
  %stackptr = alloca <8 x i32>, align 32
  %z = load <8 x i32>, ptr @zero, align 32
  %t1 = load <8 x i32>, ptr @n1, align 32
  store <8 x i32> %t1, ptr @zero, align 32
  store volatile <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, ptr %stackptr, align 32
  %stackload = load volatile <8 x i32>, ptr %stackptr, align 32
  %div = udiv <8 x i32> %z, %stackload
  %e1 = extractelement <8 x i32> %div, i64 1
  %e2 = extractelement <8 x i32> %div, i64 2
  %r = add i32 %e1, %e2
  ret i32 %r
}
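
; Illustrative sketch (hand-added; no autogenerated assertions): a minimal form
; of the aliasing hazard exercised by @main. If the extract is scalarized into
; a reload of element 2, that reload must be ordered before the store that
; clobbers %p, otherwise it would read the overwritten value.
define i32 @load_then_clobber_sketch(ptr %p) nounwind {
  %v = load <4 x i32>, ptr %p, align 16
  store <4 x i32> zeroinitializer, ptr %p, align 16
  %e = extractelement <4 x i32> %v, i64 2
  ret i32 %e
}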