; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX

; Every function below loads one scalar from memory and inserts it into
; element 0, element 1, or both, of a 128-bit vector argument. The check lines
; record the x86-64 assembly expected from llc for each configuration above;
; the +avx and +avx2 runs share a single set of expectations.
;
; The check lines are machine-generated: after a codegen change, rerun
; utils/update_llc_test_checks.py on this file instead of editing them by hand.

; Insertion into element 0 of an SSE register.

; Load a float and insert it into element 0 of %x.
; +sse2 merges with a register-to-register movss; +sse4.1 and +avx use a blend.
define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
; SSE2-LABEL: insert_f32_firstelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_f32_firstelt:
; SSE41: # %bb.0:
; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_f32_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
  %s = load float, ptr %s.addr
  %i0 = insertelement <4 x float> %x, float %s, i32 0
  ret <4 x float> %i0
}

; Load a double and insert it into element 0 of %x.
; All configurations expect a single (v)movlps with a memory operand.
define <2 x double> @insert_f64_firstelt(<2 x double> %x, ptr %s.addr) {
; SSE-LABEL: insert_f64_firstelt:
; SSE: # %bb.0:
; SSE-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_f64_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; AVX-NEXT: retq
  %s = load double, ptr %s.addr
  %i0 = insertelement <2 x double> %x, double %s, i32 0
  ret <2 x double> %i0
}

; Load an i8 and insert it into element 0 of %x.
; +sse2 expects a mask-and-merge sequence (pand/pandn/por); +sse4.1 and +avx
; expect one (v)pinsrb that reads the byte directly from memory.
define <16 x i8> @insert_i8_firstelt(<16 x i8> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i8_firstelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i8_firstelt:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i8_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i8, ptr %s.addr
  %i0 = insertelement <16 x i8> %x, i8 %s, i32 0
  ret <16 x i8> %i0
}

; Load an i16 and insert it into element 0 of %x.
; All configurations expect one (v)pinsrw with a memory operand.
define <8 x i16> @insert_i16_firstelt(<8 x i16> %x, ptr %s.addr) {
; SSE-LABEL: insert_i16_firstelt:
; SSE: # %bb.0:
; SSE-NEXT: pinsrw $0, (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_i16_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i16, ptr %s.addr
  %i0 = insertelement <8 x i16> %x, i16 %s, i32 0
  ret <8 x i16> %i0
}

; Load an i32 and insert it into element 0 of %x.
; +sse2 expects the same movss pair as the float case; +sse4.1 and +avx expect
; one (v)pinsrd with a memory operand.
define <4 x i32> @insert_i32_firstelt(<4 x i32> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i32_firstelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i32_firstelt:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrd $0, (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i32_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i32, ptr %s.addr
  %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
  ret <4 x i32> %i0
}

; Load an i64 and insert it into element 0 of %x.
; +sse2 expects movlps from memory; +sse4.1 and +avx expect (v)pinsrq.
define <2 x i64> @insert_i64_firstelt(<2 x i64> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i64_firstelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i64_firstelt:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i64_firstelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i64, ptr %s.addr
  %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
  ret <2 x i64> %i0
}

; Insertion into element 1.

; Load a float and insert it into element 1 of %x.
; +sse2 expects a movlhps/shufps shuffle sequence; +sse4.1 and +avx expect one
; (v)insertps with a memory operand.
define <4 x float> @insert_f32_secondelt(<4 x float> %x, ptr %s.addr) {
; SSE2-LABEL: insert_f32_secondelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_f32_secondelt:
; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_f32_secondelt:
; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: retq
  %s = load float, ptr %s.addr
  %i0 = insertelement <4 x float> %x, float %s, i32 1
  ret <4 x float> %i0
}

; Load a double and insert it into element 1 of %x.
; All configurations expect a single (v)movhps with a memory operand.
define <2 x double> @insert_f64_secondelt(<2 x double> %x, ptr %s.addr) {
; SSE-LABEL: insert_f64_secondelt:
; SSE: # %bb.0:
; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_f64_secondelt:
; AVX: # %bb.0:
; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT: retq
  %s = load double, ptr %s.addr
  %i0 = insertelement <2 x double> %x, double %s, i32 1
  ret <2 x double> %i0
}

; Load an i8 and insert it into element 1 of %x.
; The +sse2 sequence matches the element-0 case plus a psllw $8 that moves the
; byte into position; +sse4.1 and +avx expect one (v)pinsrb from memory.
define <16 x i8> @insert_i8_secondelt(<16 x i8> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i8_secondelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i8_secondelt:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrb $1, (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i8_secondelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrb $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i8, ptr %s.addr
  %i0 = insertelement <16 x i8> %x, i8 %s, i32 1
  ret <16 x i8> %i0
}

; Load an i16 and insert it into element 1 of %x.
; All configurations expect one (v)pinsrw with a memory operand.
define <8 x i16> @insert_i16_secondelt(<8 x i16> %x, ptr %s.addr) {
; SSE-LABEL: insert_i16_secondelt:
; SSE: # %bb.0:
; SSE-NEXT: pinsrw $1, (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_i16_secondelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrw $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i16, ptr %s.addr
  %i0 = insertelement <8 x i16> %x, i16 %s, i32 1
  ret <8 x i16> %i0
}

; Load an i32 and insert it into element 1 of %x.
; +sse2 expects the same shuffle sequence as the float case; +sse4.1 and +avx
; expect one (v)pinsrd with a memory operand.
define <4 x i32> @insert_i32_secondelt(<4 x i32> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i32_secondelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i32_secondelt:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrd $1, (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i32_secondelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrd $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i32, ptr %s.addr
  %i0 = insertelement <4 x i32> %x, i32 %s, i32 1
  ret <4 x i32> %i0
}

; Load an i64 and insert it into element 1 of %x.
; +sse2 expects a separate movsd load followed by movlhps; +sse4.1 and +avx
; expect one (v)pinsrq with a memory operand.
define <2 x i64> @insert_i64_secondelt(<2 x i64> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i64_secondelt:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i64_secondelt:
; SSE41: # %bb.0:
; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i64_secondelt:
; AVX: # %bb.0:
; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i64, ptr %s.addr
  %i0 = insertelement <2 x i64> %x, i64 %s, i32 1
  ret <2 x i64> %i0
}

; Insertion of the same loaded scalar into two elements (0 and 1).

; Load a float once and insert it into elements 0 and 1 of %x.
; All configurations expect one scalar load and one (v)shufps.
define <4 x float> @insert_f32_two_elts(<4 x float> %x, ptr %s.addr) {
; SSE-LABEL: insert_f32_two_elts:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_f32_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
; AVX-NEXT: retq
  %s = load float, ptr %s.addr
  %i0 = insertelement <4 x float> %x, float %s, i32 0
  %i1 = insertelement <4 x float> %i0, float %s, i32 1
  ret <4 x float> %i1
}

; Load a double once and insert it into both elements of %x. Both elements are
; overwritten, so none of the expected sequences read the incoming xmm0.
define <2 x double> @insert_f64_two_elts(<2 x double> %x, ptr %s.addr) {
; SSE2-LABEL: insert_f64_two_elts:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_f64_two_elts:
; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_f64_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
  %s = load double, ptr %s.addr
  %i0 = insertelement <2 x double> %x, double %s, i32 0
  %i1 = insertelement <2 x double> %i0, double %s, i32 1
  ret <2 x double> %i1
}

; Load an i8 once and insert it into elements 0 and 1 of %x.
; +sse2 expects two back-to-back mask-and-merge sequences; +sse4.1 and +avx
; expect one movzbl load into eax followed by two (v)pinsrb from the register.
define <16 x i8> @insert_i8_two_elts(<16 x i8> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i8_two_elts:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: psllw $8, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i8_two_elts:
; SSE41: # %bb.0:
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: pinsrb $0, %eax, %xmm0
; SSE41-NEXT: pinsrb $1, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i8_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i8, ptr %s.addr
  %i0 = insertelement <16 x i8> %x, i8 %s, i32 0
  %i1 = insertelement <16 x i8> %i0, i8 %s, i32 1
  ret <16 x i8> %i1
}

; Load an i16 once and insert it into elements 0 and 1 of %x.
; All configurations expect one movzwl load followed by two (v)pinsrw.
define <8 x i16> @insert_i16_two_elts(<8 x i16> %x, ptr %s.addr) {
; SSE-LABEL: insert_i16_two_elts:
; SSE: # %bb.0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: pinsrw $0, %eax, %xmm0
; SSE-NEXT: pinsrw $1, %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_i16_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: movzwl (%rdi), %eax
; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i16, ptr %s.addr
  %i0 = insertelement <8 x i16> %x, i16 %s, i32 0
  %i1 = insertelement <8 x i16> %i0, i16 %s, i32 1
  ret <8 x i16> %i1
}

; Load an i32 once and insert it into elements 0 and 1 of %x.
; +sse2 expects a GPR load, two movd copies and a punpcklqdq/shufps shuffle;
; +sse4.1 and +avx expect one movl load followed by two (v)pinsrd.
define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, ptr %s.addr) {
; SSE2-LABEL: insert_i32_two_elts:
; SSE2: # %bb.0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i32_two_elts:
; SSE41: # %bb.0:
; SSE41-NEXT: movl (%rdi), %eax
; SSE41-NEXT: pinsrd $0, %eax, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i32_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: movl (%rdi), %eax
; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
  %s = load i32, ptr %s.addr
  %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
  %i1 = insertelement <4 x i32> %i0, i32 %s, i32 1
  ret <4 x i32> %i1
}

; Load an i64 once and insert it into both elements of %x. Both elements are
; overwritten, so none of the expected sequences read the incoming xmm0.
define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, ptr %s.addr) {
; SSE-LABEL: insert_i64_two_elts:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_i64_two_elts:
; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
  %s = load i64, ptr %s.addr
  %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
  %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
  ret <2 x i64> %i1
}

; Special tests: the loaded scalar has more than one independent use.

; One loaded i32 is inserted into element 0 of %x and into element 1 of %y;
; both resulting vectors are stored out through %x.out.addr and %y.out.addr.
; Every configuration expects a single movl load feeding both insertions.
define void @insert_i32_two_elts_into_different_vectors(<4 x i32> %x, <4 x i32> %y, ptr %s.addr, ptr %x.out.addr, ptr %y.out.addr) {
; SSE2-LABEL: insert_i32_two_elts_into_different_vectors:
; SSE2: # %bb.0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
; SSE2-NEXT: movaps %xmm0, (%rsi)
; SSE2-NEXT: movaps %xmm2, (%rdx)
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_i32_two_elts_into_different_vectors:
; SSE41: # %bb.0:
; SSE41-NEXT: movl (%rdi), %eax
; SSE41-NEXT: pinsrd $0, %eax, %xmm0
; SSE41-NEXT: pinsrd $1, %eax, %xmm1
; SSE41-NEXT: movdqa %xmm0, (%rsi)
; SSE41-NEXT: movdqa %xmm1, (%rdx)
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_i32_two_elts_into_different_vectors:
; AVX: # %bb.0:
; AVX-NEXT: movl (%rdi), %eax
; AVX-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
  %s = load i32, ptr %s.addr
  %i0 = insertelement <4 x i32> %x, i32 %s, i32 0
  %i1 = insertelement <4 x i32> %y, i32 %s, i32 1
  store <4 x i32> %i0, ptr %x.out.addr
  store <4 x i32> %i1, ptr %y.out.addr
  ret void
}

; Same as insert_f32_two_elts, except the loaded float is also stored to
; %s.out, giving the scalar a use besides the two insertions. The expected code
; stores the loaded register with (v)movss before the shuffle.
define <4 x float> @insert_f32_two_elts_extrause_of_scalar(<4 x float> %x, ptr %s.addr, ptr %s.out) {
; SSE-LABEL: insert_f32_two_elts_extrause_of_scalar:
; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movss %xmm1, (%rsi)
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_f32_two_elts_extrause_of_scalar:
; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss %xmm1, (%rsi)
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[2,3]
; AVX-NEXT: retq
  %s = load float, ptr %s.addr
  store float %s, ptr %s.out
  %i0 = insertelement <4 x float> %x, float %s, i32 0
  %i1 = insertelement <4 x float> %i0, float %s, i32 1
  ret <4 x float> %i1
}