; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX

; Verify we fold loads into unary sse intrinsics only when optimizing for size

; Without any size optimization the load stays a separate movss/movsd.
define float @rcpss(ptr %a) {
; SSE-LABEL: rcpss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    rcpss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define float @rsqrtss(ptr %a) {
; SSE-LABEL: rsqrtss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    rsqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define float @sqrtss(ptr %a) {
; SSE-LABEL: sqrtss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define double @sqrtsd(ptr %a) {
; SSE-LABEL: sqrtsd:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, ptr %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

; optsize: the scalar load is folded into the instruction's memory operand.
define float @rcpss_size(ptr %a) optsize {
; SSE-LABEL: rcpss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rcpss_full_size(ptr %a) optsize {
; SSE-LABEL: rcpss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
  ret <4 x float> %res
}

; pgso: !prof !14 gives this function an entry count of 0, so (with the
; module's ProfileSummary metadata below) profile-guided size optimization
; treats it as cold and performs the same fold as optsize.
define float @rcpss_pgso(ptr %a) !prof !14 {
; SSE-LABEL: rcpss_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rcpss_full_pgso(ptr %a) !prof !14 {
; SSE-LABEL: rcpss_full_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rcpss_full_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @rsqrtss_size(ptr %a) optsize {
; SSE-LABEL: rsqrtss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rsqrtss_full_size(ptr %a) optsize {
; SSE-LABEL: rsqrtss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @rsqrtss_pgso(ptr %a) !prof !14 {
; SSE-LABEL: rsqrtss_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @rsqrtss_full_pgso(ptr %a) !prof !14 {
; SSE-LABEL: rsqrtss_full_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: rsqrtss_full_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @sqrtss_size(ptr %a) optsize {
; SSE-LABEL: sqrtss_size:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

; NOTE(review): unlike rcpss_full_size/rsqrtss_full_size above, the
; full-vector sqrt variants below keep a separate movaps/movapd load even
; under optsize/pgso.
define <4 x float> @sqrtss_full_size(ptr %a) optsize {
; SSE-LABEL: sqrtss_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

; The volatile load is preserved as its own instruction.
define <4 x float> @sqrtss_full_size_volatile(ptr %a) optsize {
; SSE-LABEL: sqrtss_full_size_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_size_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load volatile <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define float @sqrtss_pgso(ptr %a) !prof !14 {
; SSE-LABEL: sqrtss_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load float, ptr %a
  %ins = insertelement <4 x float> undef, float %ld, i32 0
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ins)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

define <4 x float> @sqrtss_full_pgso(ptr %a) !prof !14 {
; SSE-LABEL: sqrtss_full_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define <4 x float> @sqrtss_full_pgso_volatile(ptr %a) !prof !14 {
; SSE-LABEL: sqrtss_full_pgso_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtss_full_pgso_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load volatile <4 x float>, ptr %a
  %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
  ret <4 x float> %res
}

define double @sqrtsd_size(ptr %a) optsize {
; SSE-LABEL: sqrtsd_size:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, ptr %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

define <2 x double> @sqrtsd_full_size(ptr %a) optsize {
; SSE-LABEL: sqrtsd_full_size:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_size:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <2 x double>, ptr %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

define <2 x double> @sqrtsd_full_size_volatile(ptr %a) optsize {
; SSE-LABEL: sqrtsd_full_size_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_size_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load volatile <2 x double>, ptr %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

define double @sqrtsd_pgso(ptr %a) !prof !14 {
; SSE-LABEL: sqrtsd_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load double, ptr %a
  %ins = insertelement <2 x double> undef, double %ld, i32 0
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ins)
  %ext = extractelement <2 x double> %res, i32 0
  ret double %ext
}

define <2 x double> @sqrtsd_full_pgso(ptr %a) !prof !14 {
; SSE-LABEL: sqrtsd_full_pgso:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_pgso:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load <2 x double>, ptr %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

define <2 x double> @sqrtsd_full_pgso_volatile(ptr %a) !prof !14 {
; SSE-LABEL: sqrtsd_full_pgso_volatile:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm0
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtsd_full_pgso_volatile:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm0
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %ld = load volatile <2 x double>, ptr %a
  %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
  ret <2 x double> %res
}

declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone

; ProfileSummary module metadata; together with the function_entry_count of 0
; in !14 it enables the pgso (profile-guided size optimization) cases above.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}