; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s

; rdar: 12558838
; PR14221
; The scalar SSE unary intrinsics and the machine instructions they map to do
; not agree: the instruction only partially updates its destination register,
; whereas the intrinsic passes the upper FP elements of its operand through.
; Each test below extracts element 0 (the computed value) and element 1 (a
; passed-through value) of the intrinsic result. In the expected assembly the
; rsqrt/rcp forms use the same register as source and destination, while the
; sqrt forms write a separate destination and take the upper element from the
; untouched input register.

; rsqrtss: expected to operate in place on xmm0, with both lanes then widened
; to double and handed to @callee as a tail call.
define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: rsqrtss:
; CHECK: ## %bb.0:
; CHECK-NEXT: rsqrtss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee ## TAILCALL
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
  %lane0 = extractelement <4 x float> %res, i32 0
  %lane0.ext = fpext float %lane0 to double
  %lane1 = extractelement <4 x float> %res, i32 1
  %lane1.ext = fpext float %lane1 to double
  tail call void @callee(double %lane0.ext, double %lane1.ext) nounwind
  ret void
}

; rcpss: same shape as the rsqrtss test, using the reciprocal intrinsic.
define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: rcpss:
; CHECK: ## %bb.0:
; CHECK-NEXT: rcpss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee ## TAILCALL
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
  %lane0 = extractelement <4 x float> %res, i32 0
  %lane0.ext = fpext float %lane0 to double
  %lane1 = extractelement <4 x float> %res, i32 1
  %lane1.ext = fpext float %lane1 to double
  tail call void @callee(double %lane0.ext, double %lane1.ext) nounwind
  ret void
}

; sqrtss: the expected assembly places the square root in xmm1 and reads the
; passed-through element 1 from the original xmm0; xmm1 is zeroed with xorps
; before it is reused as the destination of the second conversion.
define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: sqrtss:
; CHECK: ## %bb.0:
; CHECK-NEXT: sqrtss %xmm0, %xmm1
; CHECK-NEXT: cvtss2sd %xmm1, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtss2sd %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee ## TAILCALL
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
  %lane0 = extractelement <4 x float> %res, i32 0
  %lane0.ext = fpext float %lane0 to double
  %lane1 = extractelement <4 x float> %res, i32 1
  %lane1.ext = fpext float %lane1 to double
  tail call void @callee(double %lane0.ext, double %lane1.ext) nounwind
  ret void
}

; sqrtsd: double-precision counterpart of the sqrtss test; both lanes are
; narrowed to float and passed to @callee2 as a tail call.
define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
; CHECK-LABEL: sqrtsd:
; CHECK: ## %bb.0:
; CHECK-NEXT: sqrtsd %xmm0, %xmm1
; CHECK-NEXT: cvtsd2ss %xmm1, %xmm2
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee2 ## TAILCALL
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a) nounwind
  %lane0 = extractelement <2 x double> %res, i32 0
  %lane0.trunc = fptrunc double %lane0 to float
  %lane1 = extractelement <2 x double> %res, i32 1
  %lane1.trunc = fptrunc double %lane1 to float
  tail call void @callee2(float %lane0.trunc, float %lane1.trunc) nounwind
  ret void
}

; cvtss2sd intrinsic fed by a vector load, with no size attribute: the
; expected assembly loads through a separate movss and converts
; register-to-register rather than using a memory operand.
define <2 x double> @load_fold_cvtss2sd_int(ptr %a) {
; CHECK-LABEL: load_fold_cvtss2sd_int:
; CHECK: ## %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
  %vec = load <4 x float>, ptr %a
  %cvt = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> zeroinitializer, <4 x float> %vec)
  ret <2 x double> %cvt
}

; Same IR under optsize: the expected assembly converts directly from memory.
define <2 x double> @load_fold_cvtss2sd_int_optsize(ptr %a) optsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_optsize:
; CHECK: ## %bb.0:
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
  %vec = load <4 x float>, ptr %a
  %cvt = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> zeroinitializer, <4 x float> %vec)
  ret <2 x double> %cvt
}

; Same IR under minsize: the expected assembly matches the optsize variant.
define <2 x double> @load_fold_cvtss2sd_int_minsize(ptr %a) minsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_minsize:
; CHECK: ## %bb.0:
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
  %vec = load <4 x float>, ptr %a
  %cvt = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> zeroinitializer, <4 x float> %vec)
  ret <2 x double> %cvt
}

; PR22206: element 0 of the sqrt result is added to element 0 of the original
; input. The expected assembly computes the square root into xmm1 so that the
; input in xmm0 is still available as the other addss operand.
define float @PR22206(<4 x float> %a) {
; CHECK-LABEL: PR22206:
; CHECK: ## %bb.0:
; CHECK-NEXT: sqrtss %xmm0, %xmm1
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: retq
  %root.vec = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind
  %root = extractelement <4 x float> %root.vec, i32 0
  %input = extractelement <4 x float> %a, i32 0
  %sum = fadd float %root, %input
  ret float %sum
}

; External callees used as tail-call targets by the tests above.
declare void @callee(double, double)
declare void @callee2(float, float)

; X86 intrinsics exercised by this file.
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone