1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle 2; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE 3; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1 4; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512 5; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X64-SSE 6; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1 7; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512 8 9@g16 = external global i16 10 11define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind { 12; X86-SSE-LABEL: pinsrd_1: 13; X86-SSE: ## %bb.0: 14; X86-SSE-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x44,0x24,0x04,0x01] 15; X86-SSE-NEXT: retl ## encoding: [0xc3] 16; 17; X86-AVX1-LABEL: pinsrd_1: 18; X86-AVX1: ## %bb.0: 19; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01] 20; X86-AVX1-NEXT: retl ## encoding: [0xc3] 21; 22; X86-AVX512-LABEL: pinsrd_1: 23; X86-AVX512: ## %bb.0: 24; X86-AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x01] 25; X86-AVX512-NEXT: retl ## encoding: [0xc3] 26; 27; X64-SSE-LABEL: pinsrd_1: 28; X64-SSE: ## %bb.0: 29; X64-SSE-NEXT: pinsrd $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0xc7,0x01] 30; X64-SSE-NEXT: retq ## encoding: [0xc3] 31; 32; X64-AVX1-LABEL: pinsrd_1: 33; X64-AVX1: ## %bb.0: 34; X64-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] 35; X64-AVX1-NEXT: retq ## encoding: [0xc3] 36; 37; X64-AVX512-LABEL: pinsrd_1: 38; X64-AVX512: ## %bb.0: 39; X64-AVX512-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x01] 40; X64-AVX512-NEXT: retq ## encoding: [0xc3] 41 %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1 42 ret <4 x i32> %tmp1 43} 44 45define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind { 46; X86-SSE-LABEL: pinsrb_1: 47; X86-SSE: ## %bb.0: 48; X86-SSE-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0x44,0x24,0x04,0x01] 49; X86-SSE-NEXT: retl ## encoding: [0xc3] 50; 51; X86-AVX1-LABEL: pinsrb_1: 52; X86-AVX1: ## %bb.0: 53; X86-AVX1-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01] 54; X86-AVX1-NEXT: retl ## encoding: [0xc3] 55; 56; X86-AVX512-LABEL: pinsrb_1: 57; X86-AVX512: ## %bb.0: 58; X86-AVX512-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0x44,0x24,0x04,0x01] 59; X86-AVX512-NEXT: retl ## encoding: [0xc3] 60; 61; X64-SSE-LABEL: pinsrb_1: 62; X64-SSE: ## %bb.0: 63; X64-SSE-NEXT: pinsrb $1, %edi, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x20,0xc7,0x01] 64; X64-SSE-NEXT: retq ## encoding: [0xc3] 65; 66; X64-AVX1-LABEL: pinsrb_1: 67; X64-AVX1: ## %bb.0: 68; X64-AVX1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01] 69; X64-AVX1-NEXT: retq ## encoding: [0xc3] 70; 71; X64-AVX512-LABEL: pinsrb_1: 72; X64-AVX512: ## %bb.0: 73; X64-AVX512-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc7,0x01] 74; X64-AVX512-NEXT: retq ## encoding: [0xc3] 75 %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1 76 ret <16 x i8> %tmp1 77} 78 79define <2 x i64> @pmovzxbq_1() nounwind { 80; X86-SSE-LABEL: pmovzxbq_1: 81; X86-SSE: ## %bb.0: ## %entry 82; X86-SSE-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A] 83; X86-SSE-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4 84; X86-SSE-NEXT: pmovzxbq (%eax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 85; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x32,0x00] 86; X86-SSE-NEXT: retl ## encoding: [0xc3] 87; 88; X86-AVX1-LABEL: pmovzxbq_1: 89; X86-AVX1: ## %bb.0: ## %entry 90; X86-AVX1-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A] 91; X86-AVX1-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4 92; X86-AVX1-NEXT: vpmovzxbq (%eax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 93; X86-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x32,0x00] 94; X86-AVX1-NEXT: retl ## encoding: [0xc3] 95; 96; X86-AVX512-LABEL: pmovzxbq_1: 97; X86-AVX512: ## %bb.0: ## %entry 98; X86-AVX512-NEXT: movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A] 99; X86-AVX512-NEXT: ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4 100; X86-AVX512-NEXT: vpmovzxbq (%eax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 101; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00] 102; X86-AVX512-NEXT: retl ## encoding: [0xc3] 103; 104; X64-SSE-LABEL: pmovzxbq_1: 105; X64-SSE: ## %bb.0: ## %entry 106; X64-SSE-NEXT: movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A] 107; X64-SSE-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load 108; X64-SSE-NEXT: pmovzxbq (%rax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 109; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x32,0x00] 110; X64-SSE-NEXT: retq ## encoding: [0xc3] 111; 112; X64-AVX1-LABEL: pmovzxbq_1: 113; X64-AVX1: ## %bb.0: ## %entry 114; X64-AVX1-NEXT: movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A] 115; X64-AVX1-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load 116; X64-AVX1-NEXT: vpmovzxbq (%rax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 117; X64-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x32,0x00] 118; X64-AVX1-NEXT: retq ## encoding: [0xc3] 119; 120; X64-AVX512-LABEL: pmovzxbq_1: 121; X64-AVX512: ## %bb.0: ## %entry 122; X64-AVX512-NEXT: movq _g16@GOTPCREL(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A] 123; X64-AVX512-NEXT: ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load 124; X64-AVX512-NEXT: vpmovzxbq (%rax), %xmm0 ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 125; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00] 126; X64-AVX512-NEXT: retq ## encoding: [0xc3] 127entry: 128 %0 = load i16, ptr @g16, align 2 ; <i16> [#uses=1] 129 %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1] 130 %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1] 131 %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1] 132 ret <2 x i64> %3 133} 134 135declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone 136 137define i32 @extractps_1(<4 x float> %v) nounwind { 138; SSE-LABEL: extractps_1: 139; SSE: ## %bb.0: 140; SSE-NEXT: extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03] 141; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 142; 143; AVX1-LABEL: extractps_1: 144; AVX1: ## %bb.0: 145; AVX1-NEXT: vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] 146; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 147; 148; AVX512-LABEL: extractps_1: 149; AVX512: ## %bb.0: 150; AVX512-NEXT: vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] 151; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 152 %s = extractelement <4 x float> %v, i32 3 153 %i = bitcast float %s to i32 154 ret i32 %i 155} 156define i32 @extractps_2(<4 x float> %v) nounwind { 157; SSE-LABEL: extractps_2: 158; SSE: ## %bb.0: 159; SSE-NEXT: extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03] 160; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 161; 162; AVX1-LABEL: extractps_2: 163; AVX1: ## %bb.0: 164; AVX1-NEXT: vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] 165; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 166; 167; AVX512-LABEL: extractps_2: 168; AVX512: ## %bb.0: 169; AVX512-NEXT: vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] 170; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 171 %t = bitcast <4 x float> %v to <4 x i32> 172 %s = extractelement <4 x i32> %t, i32 3 173 ret i32 %s 174} 175 176 177; The non-store form of extractps puts its result into a GPR. 178; This makes it suitable for an extract from a <4 x float> that 179; is bitcasted to i32, but unsuitable for much of anything else. 180 181define float @ext_1(<4 x float> %v) nounwind { 182; X86-SSE-LABEL: ext_1: 183; X86-SSE: ## %bb.0: 184; X86-SSE-NEXT: pushl %eax ## encoding: [0x50] 185; X86-SSE-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] 186; X86-SSE-NEXT: ## xmm0 = xmm0[3,3,3,3] 187; X86-SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A] 188; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 189; X86-SSE-NEXT: movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24] 190; X86-SSE-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] 191; X86-SSE-NEXT: popl %eax ## encoding: [0x58] 192; X86-SSE-NEXT: retl ## encoding: [0xc3] 193; 194; X86-AVX1-LABEL: ext_1: 195; X86-AVX1: ## %bb.0: 196; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50] 197; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 198; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] 199; X86-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] 200; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 201; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] 202; X86-AVX1-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] 203; X86-AVX1-NEXT: popl %eax ## encoding: [0x58] 204; X86-AVX1-NEXT: retl ## encoding: [0xc3] 205; 206; X86-AVX512-LABEL: ext_1: 207; X86-AVX512: ## %bb.0: 208; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] 209; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 210; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] 211; X86-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] 212; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 213; X86-AVX512-NEXT: vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] 214; X86-AVX512-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] 215; X86-AVX512-NEXT: popl %eax ## encoding: [0x58] 216; X86-AVX512-NEXT: retl ## encoding: [0xc3] 217; 218; X64-SSE-LABEL: ext_1: 219; X64-SSE: ## %bb.0: 220; X64-SSE-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] 221; X64-SSE-NEXT: ## xmm0 = xmm0[3,3,3,3] 222; X64-SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A] 223; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte 224; X64-SSE-NEXT: retq ## encoding: [0xc3] 225; 226; X64-AVX1-LABEL: ext_1: 227; X64-AVX1: ## %bb.0: 228; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 229; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] 230; X64-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] 231; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte 232; X64-AVX1-NEXT: retq ## encoding: [0xc3] 233; 234; X64-AVX512-LABEL: ext_1: 235; X64-AVX512: ## %bb.0: 236; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 237; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] 238; X64-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] 239; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte 240; X64-AVX512-NEXT: retq ## encoding: [0xc3] 241 %s = extractelement <4 x float> %v, i32 3 242 %t = fadd float %s, 1.0 243 ret float %t 244} 245 246define float @ext_2(<4 x float> %v) nounwind { 247; X86-SSE-LABEL: ext_2: 248; X86-SSE: ## %bb.0: 249; X86-SSE-NEXT: pushl %eax ## encoding: [0x50] 250; X86-SSE-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] 251; X86-SSE-NEXT: ## xmm0 = xmm0[3,3,3,3] 252; X86-SSE-NEXT: movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24] 253; X86-SSE-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] 254; X86-SSE-NEXT: popl %eax ## encoding: [0x58] 255; X86-SSE-NEXT: retl ## encoding: [0xc3] 256; 257; X86-AVX1-LABEL: ext_2: 258; X86-AVX1: ## %bb.0: 259; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50] 260; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 261; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] 262; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] 263; X86-AVX1-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] 264; X86-AVX1-NEXT: popl %eax ## encoding: [0x58] 265; X86-AVX1-NEXT: retl ## encoding: [0xc3] 266; 267; X86-AVX512-LABEL: ext_2: 268; X86-AVX512: ## %bb.0: 269; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] 270; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 271; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] 272; X86-AVX512-NEXT: vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] 273; X86-AVX512-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] 274; X86-AVX512-NEXT: popl %eax ## encoding: [0x58] 275; X86-AVX512-NEXT: retl ## encoding: [0xc3] 276; 277; X64-SSE-LABEL: ext_2: 278; X64-SSE: ## %bb.0: 279; X64-SSE-NEXT: shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff] 280; X64-SSE-NEXT: ## xmm0 = xmm0[3,3,3,3] 281; X64-SSE-NEXT: retq ## encoding: [0xc3] 282; 283; X64-AVX1-LABEL: ext_2: 284; X64-AVX1: ## %bb.0: 285; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 286; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] 287; X64-AVX1-NEXT: retq ## encoding: [0xc3] 288; 289; X64-AVX512-LABEL: ext_2: 290; X64-AVX512: ## %bb.0: 291; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 292; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] 293; X64-AVX512-NEXT: retq ## encoding: [0xc3] 294 %s = extractelement <4 x float> %v, i32 3 295 ret float %s 296} 297 298define i32 @ext_3(<4 x i32> %v) nounwind { 299; SSE-LABEL: ext_3: 300; SSE: ## %bb.0: 301; SSE-NEXT: extractps $3, %xmm0, %eax ## encoding: [0x66,0x0f,0x3a,0x17,0xc0,0x03] 302; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 303; 304; AVX1-LABEL: ext_3: 305; AVX1: ## %bb.0: 306; AVX1-NEXT: vextractps $3, %xmm0, %eax ## encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] 307; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 308; 309; AVX512-LABEL: ext_3: 310; AVX512: ## %bb.0: 311; AVX512-NEXT: vextractps $3, %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x17,0xc0,0x03] 312; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 313 %i = extractelement <4 x i32> %v, i32 3 314 ret i32 %i 315} 316 317define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind { 318; SSE-LABEL: insertps_1: 319; SSE: ## %bb.0: 320; SSE-NEXT: insertps $21, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x15] 321; SSE-NEXT: ## xmm0 = zero,xmm1[0],zero,xmm0[3] 322; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 323; 324; AVX1-LABEL: insertps_1: 325; AVX1: ## %bb.0: 326; AVX1-NEXT: vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15] 327; AVX1-NEXT: ## xmm0 = zero,xmm1[0],zero,xmm0[3] 328; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 329; 330; AVX512-LABEL: insertps_1: 331; AVX512: ## %bb.0: 332; AVX512-NEXT: vinsertps $21, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15] 333; AVX512-NEXT: ## xmm0 = zero,xmm1[0],zero,xmm0[3] 334; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 335 %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone 336 ret <4 x float> %tmp1 337} 338 339declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone 340 341; When optimizing for speed, prefer blendps over insertps even if it means we have to 342; generate a separate movss to load the scalar operand. 343define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind { 344; X86-SSE-LABEL: blendps_not_insertps_1: 345; X86-SSE: ## %bb.0: 346; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero 347; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04] 348; X86-SSE-NEXT: blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01] 349; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 350; X86-SSE-NEXT: retl ## encoding: [0xc3] 351; 352; X86-AVX1-LABEL: blendps_not_insertps_1: 353; X86-AVX1: ## %bb.0: 354; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero 355; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] 356; X86-AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] 357; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 358; X86-AVX1-NEXT: retl ## encoding: [0xc3] 359; 360; X86-AVX512-LABEL: blendps_not_insertps_1: 361; X86-AVX512: ## %bb.0: 362; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero 363; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] 364; X86-AVX512-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] 365; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 366; X86-AVX512-NEXT: retl ## encoding: [0xc3] 367; 368; X64-SSE-LABEL: blendps_not_insertps_1: 369; X64-SSE: ## %bb.0: 370; X64-SSE-NEXT: blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01] 371; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 372; X64-SSE-NEXT: retq ## encoding: [0xc3] 373; 374; X64-AVX-LABEL: blendps_not_insertps_1: 375; X64-AVX: ## %bb.0: 376; X64-AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] 377; X64-AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 378; X64-AVX-NEXT: retq ## encoding: [0xc3] 379 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 380 ret <4 x float> %tmp1 381} 382 383; When optimizing for size, generate an insertps if there's a load fold opportunity. 384; The difference between i386 and x86-64 ABIs for the float operand means we should 385; generate an insertps for X86 but not for X64! 386define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind { 387; X86-SSE-LABEL: insertps_or_blendps: 388; X86-SSE: ## %bb.0: 389; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero 390; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04] 391; X86-SSE-NEXT: movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1] 392; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 393; X86-SSE-NEXT: retl ## encoding: [0xc3] 394; 395; X86-AVX1-LABEL: insertps_or_blendps: 396; X86-AVX1: ## %bb.0: 397; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero 398; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] 399; X86-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] 400; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 401; X86-AVX1-NEXT: retl ## encoding: [0xc3] 402; 403; X86-AVX512-LABEL: insertps_or_blendps: 404; X86-AVX512: ## %bb.0: 405; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero 406; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04] 407; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] 408; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 409; X86-AVX512-NEXT: retl ## encoding: [0xc3] 410; 411; X64-SSE-LABEL: insertps_or_blendps: 412; X64-SSE: ## %bb.0: 413; X64-SSE-NEXT: movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1] 414; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 415; X64-SSE-NEXT: retq ## encoding: [0xc3] 416; 417; X64-AVX1-LABEL: insertps_or_blendps: 418; X64-AVX1: ## %bb.0: 419; X64-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1] 420; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 421; X64-AVX1-NEXT: retq ## encoding: [0xc3] 422; 423; X64-AVX512-LABEL: insertps_or_blendps: 424; X64-AVX512: ## %bb.0: 425; X64-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1] 426; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 427; X64-AVX512-NEXT: retq ## encoding: [0xc3] 428 %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 429 ret <4 x float> %tmp1 430} 431 432; An insert into the low 32-bits of a vector from the low 32-bits of another vector 433; is always just a blendps because blendps is never more expensive than insertps. 434define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind { 435; SSE-LABEL: blendps_not_insertps_2: 436; SSE: ## %bb.0: 437; SSE-NEXT: blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01] 438; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 439; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 440; 441; AVX-LABEL: blendps_not_insertps_2: 442; AVX: ## %bb.0: 443; AVX-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01] 444; AVX-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3] 445; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 446 %tmp2 = extractelement <4 x float> %t2, i32 0 447 %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 448 ret <4 x float> %tmp1 449} 450 451define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind { 452; SSE-LABEL: ptestz_1: 453; SSE: ## %bb.0: 454; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] 455; SSE-NEXT: ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1] 456; SSE-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] 457; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 458; 459; AVX-LABEL: ptestz_1: 460; AVX: ## %bb.0: 461; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] 462; AVX-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] 463; AVX-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0] 464; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 465 %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone 466 ret i32 %tmp1 467} 468 469define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind { 470; SSE-LABEL: ptestz_2: 471; SSE: ## %bb.0: 472; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] 473; SSE-NEXT: ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1] 474; SSE-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] 475; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 476; 477; AVX-LABEL: ptestz_2: 478; AVX: ## %bb.0: 479; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] 480; AVX-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] 481; AVX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0] 482; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 483 %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone 484 ret i32 %tmp1 485} 486 487define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind { 488; SSE-LABEL: ptestz_3: 489; SSE: ## %bb.0: 490; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] 491; SSE-NEXT: ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1] 492; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] 493; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 494; 495; AVX-LABEL: ptestz_3: 496; AVX: ## %bb.0: 497; AVX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0] 498; AVX-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1] 499; AVX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0] 500; AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 501 %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone 502 ret i32 %tmp1 503} 504 505declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone 506declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone 507declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone 508 509; This used to compile to insertps $0 + insertps $16. insertps $0 is always 510; pointless. 511define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { 512; SSE-LABEL: buildvector: 513; SSE: ## %bb.0: ## %entry 514; SSE-NEXT: movshdup %xmm0, %xmm2 ## encoding: [0xf3,0x0f,0x16,0xd0] 515; SSE-NEXT: ## xmm2 = xmm0[1,1,3,3] 516; SSE-NEXT: movshdup %xmm1, %xmm3 ## encoding: [0xf3,0x0f,0x16,0xd9] 517; SSE-NEXT: ## xmm3 = xmm1[1,1,3,3] 518; SSE-NEXT: addss %xmm2, %xmm3 ## encoding: [0xf3,0x0f,0x58,0xda] 519; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1] 520; SSE-NEXT: insertps $16, %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc3,0x10] 521; SSE-NEXT: ## xmm0 = xmm0[0],xmm3[0],xmm0[2,3] 522; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 523; 524; AVX1-LABEL: buildvector: 525; AVX1: ## %bb.0: ## %entry 526; AVX1-NEXT: vmovshdup %xmm0, %xmm2 ## encoding: [0xc5,0xfa,0x16,0xd0] 527; AVX1-NEXT: ## xmm2 = xmm0[1,1,3,3] 528; AVX1-NEXT: vmovshdup %xmm1, %xmm3 ## encoding: [0xc5,0xfa,0x16,0xd9] 529; AVX1-NEXT: ## xmm3 = xmm1[1,1,3,3] 530; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 ## encoding: [0xc5,0xea,0x58,0xd3] 531; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1] 532; AVX1-NEXT: vinsertps $16, %xmm2, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10] 533; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3] 534; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 535; 536; AVX512-LABEL: buildvector: 537; AVX512: ## %bb.0: ## %entry 538; AVX512-NEXT: vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0] 539; AVX512-NEXT: ## xmm2 = xmm0[1,1,3,3] 540; AVX512-NEXT: vmovshdup %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd9] 541; AVX512-NEXT: ## xmm3 = xmm1[1,1,3,3] 542; AVX512-NEXT: vaddss %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xea,0x58,0xd3] 543; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1] 544; AVX512-NEXT: vinsertps $16, %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x10] 545; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3] 546; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 547entry: 548 %tmp7 = extractelement <2 x float> %A, i32 0 549 %tmp5 = extractelement <2 x float> %A, i32 1 550 %tmp3 = extractelement <2 x float> %B, i32 0 551 %tmp1 = extractelement <2 x float> %B, i32 1 552 %add.r = fadd float %tmp7, %tmp3 553 %add.i = fadd float %tmp5, %tmp1 554 %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0 555 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1 556 ret <2 x float> %tmp9 557} 558 559define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, ptr nocapture readonly %pb) { 560; X86-SSE-LABEL: insertps_from_shufflevector_1: 561; X86-SSE: ## %bb.0: ## %entry 562; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 563; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 564; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 565; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 566; X86-SSE-NEXT: retl ## encoding: [0xc3] 567; 568; X86-AVX1-LABEL: insertps_from_shufflevector_1: 569; X86-AVX1: ## %bb.0: ## %entry 570; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 571; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 572; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 573; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 574; X86-AVX1-NEXT: retl ## encoding: [0xc3] 575; 576; X86-AVX512-LABEL: insertps_from_shufflevector_1: 577; X86-AVX512: ## %bb.0: ## %entry 578; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 579; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 580; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 581; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 582; X86-AVX512-NEXT: retl ## encoding: [0xc3] 583; 584; X64-SSE-LABEL: insertps_from_shufflevector_1: 585; X64-SSE: ## %bb.0: ## %entry 586; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 587; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 588; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 589; X64-SSE-NEXT: retq ## encoding: [0xc3] 590; 591; X64-AVX1-LABEL: insertps_from_shufflevector_1: 592; X64-AVX1: ## %bb.0: ## %entry 593; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] 594; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 595; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 596; X64-AVX1-NEXT: retq ## encoding: [0xc3] 597; 598; X64-AVX512-LABEL: insertps_from_shufflevector_1: 599; X64-AVX512: ## %bb.0: ## %entry 600; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 601; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 602; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 603; X64-AVX512-NEXT: retq ## encoding: [0xc3] 604entry: 605 %0 = load <4 x float>, ptr %pb, align 16 606 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 607 ret <4 x float> %vecinit6 608} 609 610define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) { 611; SSE-LABEL: insertps_from_shufflevector_2: 612; SSE: ## %bb.0: ## %entry 613; SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] 614; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 615; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 616; 617; AVX1-LABEL: insertps_from_shufflevector_2: 618; AVX1: ## %bb.0: ## %entry 619; AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 620; AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 621; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 622; 623; AVX512-LABEL: insertps_from_shufflevector_2: 624; AVX512: ## %bb.0: ## %entry 625; AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 626; AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 627; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 628entry: 629 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 630 ret <4 x float> %vecinit6 631} 632 633; For loading an i32 from memory into an xmm register we use pinsrd 634; instead of insertps 635define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, ptr nocapture readonly %pb) { 636; X86-SSE-LABEL: pinsrd_from_shufflevector_i32: 637; X86-SSE: ## %bb.0: ## %entry 638; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 639; X86-SSE-NEXT: pshufd $0, (%eax), %xmm1 ## encoding: [0x66,0x0f,0x70,0x08,0x00] 640; X86-SSE-NEXT: ## xmm1 = mem[0,0,0,0] 641; X86-SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] 642; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 643; X86-SSE-NEXT: retl ## encoding: [0xc3] 644; 645; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32: 646; X86-AVX1: ## %bb.0: ## %entry 647; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 648; X86-AVX1-NEXT: vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08] 649; X86-AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 650; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 651; X86-AVX1-NEXT: retl ## encoding: [0xc3] 652; 653; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32: 654; X86-AVX512: ## %bb.0: ## %entry 655; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 656; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08] 657; X86-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 658; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 659; X86-AVX512-NEXT: retl ## encoding: [0xc3] 660; 661; X64-SSE-LABEL: pinsrd_from_shufflevector_i32: 662; X64-SSE: ## %bb.0: ## %entry 663; X64-SSE-NEXT: pshufd $0, (%rdi), %xmm1 ## encoding: [0x66,0x0f,0x70,0x0f,0x00] 664; X64-SSE-NEXT: ## xmm1 = mem[0,0,0,0] 665; X64-SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] 666; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 667; X64-SSE-NEXT: retq ## encoding: [0xc3] 668; 669; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32: 670; X64-AVX1: ## %bb.0: ## %entry 671; X64-AVX1-NEXT: vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f] 672; X64-AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 673; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 674; X64-AVX1-NEXT: retq ## encoding: [0xc3] 675; 676; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32: 677; X64-AVX512: ## %bb.0: ## %entry 678; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f] 679; X64-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 680; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 681; X64-AVX512-NEXT: retq ## encoding: [0xc3] 682entry: 683 %0 = load <4 x i32>, ptr %pb, align 16 684 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 685 ret <4 x i32> %vecinit6 686} 687 688define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) { 689; SSE-LABEL: insertps_from_shufflevector_i32_2: 690; SSE: ## %bb.0: ## %entry 691; SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] 692; SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] 693; SSE-NEXT: pblendw $12, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x0c] 694; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 695; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 696; 697; AVX1-LABEL: insertps_from_shufflevector_i32_2: 698; AVX1: ## %bb.0: ## %entry 699; AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] 700; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] 701; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 702; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 703; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 704; 705; AVX512-LABEL: insertps_from_shufflevector_i32_2: 706; AVX512: ## %bb.0: ## %entry 707; AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] 708; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] 709; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 710; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 711; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 712entry: 713 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3> 714 ret <4 x i32> %vecinit6 715} 716 717define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, ptr %b) { 718; X86-SSE-LABEL: insertps_from_load_ins_elt_undef: 719; X86-SSE: ## %bb.0: 720; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 721; X86-SSE-NEXT: insertps $16, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x00,0x10] 722; X86-SSE-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 723; X86-SSE-NEXT: retl ## encoding: [0xc3] 724; 725; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef: 726; X86-AVX1: ## %bb.0: 727; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 728; X86-AVX1-NEXT: vinsertps $16, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10] 729; X86-AVX1-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 730; X86-AVX1-NEXT: retl ## encoding: [0xc3] 731; 732; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef: 733; X86-AVX512: ## %bb.0: 734; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 735; X86-AVX512-NEXT: vinsertps $16, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x10] 736; X86-AVX512-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 737; X86-AVX512-NEXT: retl ## encoding: [0xc3] 738; 739; X64-SSE-LABEL: insertps_from_load_ins_elt_undef: 740; X64-SSE: ## %bb.0: 741; X64-SSE-NEXT: insertps $16, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x07,0x10] 742; X64-SSE-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 743; X64-SSE-NEXT: retq ## encoding: [0xc3] 744; 745; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef: 746; X64-AVX1: ## %bb.0: 747; X64-AVX1-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10] 748; X64-AVX1-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 749; X64-AVX1-NEXT: retq ## encoding: [0xc3] 750; 751; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef: 752; X64-AVX512: ## %bb.0: 753; X64-AVX512-NEXT: vinsertps $16, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x10] 754; X64-AVX512-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] 755; X64-AVX512-NEXT: retq ## encoding: [0xc3] 756 %1 = load float, ptr %b, align 4 757 %2 = insertelement <4 x float> undef, float %1, i32 0 758 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> 759 ret <4 x float> %result 760} 761 762; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr 763define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, ptr %b) { 764; X86-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: 765; X86-SSE: ## %bb.0: 766; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 767; X86-SSE-NEXT: pinsrd $2, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x00,0x02] 768; X86-SSE-NEXT: retl ## encoding: [0xc3] 769; 770; X86-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: 771; X86-AVX1: ## %bb.0: 772; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 773; X86-AVX1-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] 774; X86-AVX1-NEXT: retl ## encoding: [0xc3] 775; 776; X86-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: 777; X86-AVX512: ## %bb.0: 778; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 779; X86-AVX512-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x00,0x02] 780; X86-AVX512-NEXT: retl ## encoding: [0xc3] 781; 782; X64-SSE-LABEL: insertps_from_load_ins_elt_undef_i32: 783; X64-SSE: ## %bb.0: 784; X64-SSE-NEXT: pinsrd $2, (%rdi), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x22,0x07,0x02] 785; X64-SSE-NEXT: retq ## encoding: [0xc3] 786; 787; X64-AVX1-LABEL: insertps_from_load_ins_elt_undef_i32: 788; X64-AVX1: ## %bb.0: 789; X64-AVX1-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] 790; X64-AVX1-NEXT: retq ## encoding: [0xc3] 791; 792; X64-AVX512-LABEL: insertps_from_load_ins_elt_undef_i32: 793; X64-AVX512: ## %bb.0: 794; X64-AVX512-NEXT: vpinsrd $2, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x07,0x02] 795; X64-AVX512-NEXT: retq ## encoding: [0xc3] 796 %1 = load i32, ptr %b, align 4 797 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 798 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 799 ret <4 x i32> %result 800} 801 802;;;;;; Shuffles optimizable with a single insertps or blend instruction 803define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { 804; SSE-LABEL: shuf_XYZ0: 805; SSE: ## %bb.0: 806; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 807; SSE-NEXT: blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08] 808; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 809; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 810; 811; AVX1-LABEL: shuf_XYZ0: 812; AVX1: ## %bb.0: 813; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 814; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 815; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 816; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 817; 818; AVX512-LABEL: shuf_XYZ0: 819; AVX512: ## %bb.0: 820; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 821; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 822; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 823; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 824 %vecext = extractelement <4 x float> %x, i32 0 825 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 826 %vecext1 = extractelement <4 x float> %x, i32 1 827 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 828 %vecext3 = extractelement <4 x float> %x, i32 2 829 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 830 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 831 ret <4 x float> %vecinit5 832} 833 834define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { 835; SSE-LABEL: shuf_XY00: 836; SSE: ## %bb.0: 837; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] 838; SSE-NEXT: ## xmm0 = xmm0[0],zero 839; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 840; 841; AVX1-LABEL: shuf_XY00: 842; AVX1: ## %bb.0: 843; AVX1-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] 844; AVX1-NEXT: ## xmm0 = xmm0[0],zero 845; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 846; 847; AVX512-LABEL: shuf_XY00: 848; AVX512: ## %bb.0: 849; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] 850; AVX512-NEXT: ## xmm0 = xmm0[0],zero 851; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 852 %vecext = extractelement <4 x float> %x, i32 0 853 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 854 %vecext1 = extractelement <4 x float> %x, i32 1 855 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 856 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 857 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 858 ret <4 x float> %vecinit4 859} 860 861define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { 862; SSE-LABEL: shuf_XYY0: 863; SSE: ## %bb.0: 864; SSE-NEXT: insertps $104, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x68] 865; SSE-NEXT: ## xmm0 = xmm0[0,1,1],zero 866; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 867; 868; AVX1-LABEL: shuf_XYY0: 869; AVX1: ## %bb.0: 870; AVX1-NEXT: vinsertps $104, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68] 871; AVX1-NEXT: ## xmm0 = xmm0[0,1,1],zero 872; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 873; 874; AVX512-LABEL: shuf_XYY0: 875; AVX512: ## %bb.0: 876; AVX512-NEXT: vinsertps $104, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x68] 877; AVX512-NEXT: ## xmm0 = xmm0[0,1,1],zero 878; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 879 %vecext = extractelement <4 x float> %x, i32 0 880 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 881 %vecext1 = extractelement <4 x float> %x, i32 1 882 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 883 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2 884 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 885 ret <4 x float> %vecinit5 886} 887 888define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { 889; SSE-LABEL: shuf_XYW0: 890; SSE: ## %bb.0: 891; SSE-NEXT: insertps $232, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xe8] 892; SSE-NEXT: ## xmm0 = xmm0[0,1,3],zero 893; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 894; 895; AVX1-LABEL: shuf_XYW0: 896; AVX1: ## %bb.0: 897; AVX1-NEXT: vinsertps $232, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8] 898; AVX1-NEXT: ## xmm0 = xmm0[0,1,3],zero 899; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 900; 901; AVX512-LABEL: shuf_XYW0: 902; AVX512: ## %bb.0: 903; AVX512-NEXT: vinsertps $232, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xe8] 904; AVX512-NEXT: ## xmm0 = xmm0[0,1,3],zero 905; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 906 %vecext = extractelement <4 x float> %x, i32 0 907 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 908 %vecext1 = extractelement <4 x float> %x, i32 1 909 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 910 %vecext2 = extractelement <4 x float> %x, i32 3 911 %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2 912 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3 913 ret <4 x float> %vecinit4 914} 915 916define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { 917; SSE-LABEL: shuf_W00W: 918; SSE: ## %bb.0: 919; SSE-NEXT: insertps $198, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0xc6] 920; SSE-NEXT: ## xmm0 = xmm0[3],zero,zero,xmm0[3] 921; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 922; 923; AVX1-LABEL: shuf_W00W: 924; AVX1: ## %bb.0: 925; AVX1-NEXT: vinsertps $198, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6] 926; AVX1-NEXT: ## xmm0 = xmm0[3],zero,zero,xmm0[3] 927; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 928; 929; AVX512-LABEL: shuf_W00W: 930; AVX512: ## %bb.0: 931; AVX512-NEXT: vinsertps $198, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0xc6] 932; AVX512-NEXT: ## xmm0 = xmm0[3],zero,zero,xmm0[3] 933; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 934 %vecext = extractelement <4 x float> %x, i32 3 935 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 936 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1 937 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2 938 %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3 939 ret <4 x float> %vecinit4 940} 941 942define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { 943; SSE-LABEL: shuf_X00A: 944; SSE: ## %bb.0: 945; SSE-NEXT: insertps $54, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x36] 946; SSE-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] 947; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 948; 949; AVX1-LABEL: shuf_X00A: 950; AVX1: ## %bb.0: 951; AVX1-NEXT: vinsertps $54, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36] 952; AVX1-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] 953; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 954; 955; AVX512-LABEL: shuf_X00A: 956; AVX512: ## %bb.0: 957; AVX512-NEXT: vinsertps $54, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x36] 958; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm1[0] 959; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 960 %vecext = extractelement <4 x float> %x, i32 0 961 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 962 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 963 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 964 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 965 ret <4 x float> %vecinit4 966} 967 968define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { 969; SSE-LABEL: shuf_X00X: 970; SSE: ## %bb.0: 971; SSE-NEXT: insertps $54, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x36] 972; SSE-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] 973; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 974; 975; AVX1-LABEL: shuf_X00X: 976; AVX1: ## %bb.0: 977; AVX1-NEXT: vinsertps $54, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36] 978; AVX1-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] 979; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 980; 981; AVX512-LABEL: shuf_X00X: 982; AVX512: ## %bb.0: 983; AVX512-NEXT: vinsertps $54, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x36] 984; AVX512-NEXT: ## xmm0 = xmm0[0],zero,zero,xmm0[0] 985; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 986 %vecext = extractelement <4 x float> %x, i32 0 987 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 988 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 989 %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2 990 %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 991 ret <4 x float> %vecinit4 992} 993 994define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { 995; SSE-LABEL: shuf_X0YC: 996; SSE: ## %bb.0: 997; SSE-NEXT: xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2] 998; SSE-NEXT: unpcklps %xmm2, %xmm0 ## encoding: [0x0f,0x14,0xc2] 999; SSE-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1000; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0] 1001; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] 1002; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1003; 1004; AVX1-LABEL: shuf_X0YC: 1005; AVX1: ## %bb.0: 1006; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] 1007; AVX1-NEXT: vunpcklps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x14,0xc2] 1008; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1009; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] 1010; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] 1011; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1012; 1013; AVX512-LABEL: shuf_X0YC: 1014; AVX512: ## %bb.0: 1015; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x57,0xd2] 1016; AVX512-NEXT: vunpcklps %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc2] 1017; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1018; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] 1019; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] 1020; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1021 %vecext = extractelement <4 x float> %x, i32 0 1022 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1023 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 1024 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef> 1025 %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6> 1026 ret <4 x float> %vecinit5 1027} 1028 1029define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { 1030; SSE-LABEL: i32_shuf_XYZ0: 1031; SSE: ## %bb.0: 1032; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 1033; SSE-NEXT: blendps $8, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x08] 1034; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1035; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1036; 1037; AVX1-LABEL: i32_shuf_XYZ0: 1038; AVX1: ## %bb.0: 1039; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1040; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1041; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1042; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1043; 1044; AVX512-LABEL: i32_shuf_XYZ0: 1045; AVX512: ## %bb.0: 1046; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1047; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1048; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1049; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1050 %vecext = extractelement <4 x i32> %x, i32 0 1051 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1052 %vecext1 = extractelement <4 x i32> %x, i32 1 1053 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1054 %vecext3 = extractelement <4 x i32> %x, i32 2 1055 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2 1056 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 1057 ret <4 x i32> %vecinit5 1058} 1059 1060define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { 1061; SSE-LABEL: i32_shuf_XY00: 1062; SSE: ## %bb.0: 1063; SSE-NEXT: movq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x7e,0xc0] 1064; SSE-NEXT: ## xmm0 = xmm0[0],zero 1065; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1066; 1067; AVX1-LABEL: i32_shuf_XY00: 1068; AVX1: ## %bb.0: 1069; AVX1-NEXT: vmovq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x7e,0xc0] 1070; AVX1-NEXT: ## xmm0 = xmm0[0],zero 1071; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1072; 1073; AVX512-LABEL: i32_shuf_XY00: 1074; AVX512: ## %bb.0: 1075; AVX512-NEXT: vmovq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] 1076; AVX512-NEXT: ## xmm0 = xmm0[0],zero 1077; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1078 %vecext = extractelement <4 x i32> %x, i32 0 1079 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1080 %vecext1 = extractelement <4 x i32> %x, i32 1 1081 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1082 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 1083 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 1084 ret <4 x i32> %vecinit4 1085} 1086 1087define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { 1088; SSE-LABEL: i32_shuf_XYY0: 1089; SSE: ## %bb.0: 1090; SSE-NEXT: pshufd $212, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xd4] 1091; SSE-NEXT: ## xmm1 = xmm0[0,1,1,3] 1092; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] 1093; SSE-NEXT: pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f] 1094; SSE-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] 1095; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1096; 1097; AVX1-LABEL: i32_shuf_XYY0: 1098; AVX1: ## %bb.0: 1099; AVX1-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] 1100; AVX1-NEXT: ## xmm0 = xmm0[0,1,1,3] 1101; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1102; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1103; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1104; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1105; 1106; AVX512-LABEL: i32_shuf_XYY0: 1107; AVX512: ## %bb.0: 1108; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] 1109; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] 1110; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1111; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1112; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1113; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1114 %vecext = extractelement <4 x i32> %x, i32 0 1115 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1116 %vecext1 = extractelement <4 x i32> %x, i32 1 1117 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1118 %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2 1119 %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3 1120 ret <4 x i32> %vecinit5 1121} 1122 1123define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { 1124; SSE-LABEL: i32_shuf_XYW0: 1125; SSE: ## %bb.0: 1126; SSE-NEXT: pshufd $244, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xf4] 1127; SSE-NEXT: ## xmm1 = xmm0[0,1,3,3] 1128; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] 1129; SSE-NEXT: pblendw $63, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3f] 1130; SSE-NEXT: ## xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] 1131; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1132; 1133; AVX1-LABEL: i32_shuf_XYW0: 1134; AVX1: ## %bb.0: 1135; AVX1-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xf4] 1136; AVX1-NEXT: ## xmm0 = xmm0[0,1,3,3] 1137; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1138; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1139; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1140; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1141; 1142; AVX512-LABEL: i32_shuf_XYW0: 1143; AVX512: ## %bb.0: 1144; AVX512-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xf4] 1145; AVX512-NEXT: ## xmm0 = xmm0[0,1,3,3] 1146; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1147; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1148; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1149; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1150 %vecext = extractelement <4 x i32> %x, i32 0 1151 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1152 %vecext1 = extractelement <4 x i32> %x, i32 1 1153 %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1 1154 %vecext2 = extractelement <4 x i32> %x, i32 3 1155 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2 1156 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3 1157 ret <4 x i32> %vecinit4 1158} 1159 1160define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { 1161; SSE-LABEL: i32_shuf_W00W: 1162; SSE: ## %bb.0: 1163; SSE-NEXT: pshufd $255, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc8,0xff] 1164; SSE-NEXT: ## xmm1 = xmm0[3,3,3,3] 1165; SSE-NEXT: pxor %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xef,0xc0] 1166; SSE-NEXT: pblendw $195, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc3] 1167; SSE-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 1168; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1169; 1170; AVX1-LABEL: i32_shuf_W00W: 1171; AVX1: ## %bb.0: 1172; AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 1173; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] 1174; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1175; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1176; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1177; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1178; 1179; AVX512-LABEL: i32_shuf_W00W: 1180; AVX512: ## %bb.0: 1181; AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] 1182; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] 1183; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1184; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1185; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1186; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1187 %vecext = extractelement <4 x i32> %x, i32 3 1188 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1189 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1190 %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2 1191 %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3 1192 ret <4 x i32> %vecinit4 1193} 1194 1195define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { 1196; SSE-LABEL: i32_shuf_X00A: 1197; SSE: ## %bb.0: 1198; SSE-NEXT: pxor %xmm2, %xmm2 ## encoding: [0x66,0x0f,0xef,0xd2] 1199; SSE-NEXT: pblendw $252, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0xfc] 1200; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] 1201; SSE-NEXT: pshufd $0, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0x00] 1202; SSE-NEXT: ## xmm1 = xmm1[0,0,0,0] 1203; SSE-NEXT: pblendw $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0xc0] 1204; SSE-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 1205; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1206; 1207; AVX1-LABEL: i32_shuf_X00A: 1208; AVX1: ## %bb.0: 1209; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] 1210; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] 1211; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] 1212; AVX1-NEXT: vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00] 1213; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0] 1214; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1215; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1216; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1217; 1218; AVX512-LABEL: i32_shuf_X00A: 1219; AVX512: ## %bb.0: 1220; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] 1221; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] 1222; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] 1223; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9] 1224; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] 1225; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1226; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1227 %vecext = extractelement <4 x i32> %x, i32 0 1228 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1229 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1230 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 1231 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 1232 ret <4 x i32> %vecinit4 1233} 1234 1235define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { 1236; SSE-LABEL: i32_shuf_X00X: 1237; SSE: ## %bb.0: 1238; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9] 1239; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00] 1240; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0] 1241; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c] 1242; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 1243; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1244; 1245; AVX1-LABEL: i32_shuf_X00X: 1246; AVX1: ## %bb.0: 1247; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1248; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00] 1249; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] 1250; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1251; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1252; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1253; 1254; AVX512-LABEL: i32_shuf_X00X: 1255; AVX512: ## %bb.0: 1256; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1257; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] 1258; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] 1259; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1260; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1261 %vecext = extractelement <4 x i32> %x, i32 0 1262 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1263 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1264 %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 1265 %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 1266 ret <4 x i32> %vecinit4 1267} 1268 1269define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { 1270; SSE-LABEL: i32_shuf_X0YC: 1271; SSE: ## %bb.0: 1272; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0] 1273; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero 1274; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa] 1275; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2] 1276; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f] 1277; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] 1278; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1279; 1280; AVX1-LABEL: i32_shuf_X0YC: 1281; AVX1: ## %bb.0: 1282; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0] 1283; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero 1284; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa] 1285; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2] 1286; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0] 1287; AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 1288; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1289; 1290; AVX512-LABEL: i32_shuf_X0YC: 1291; AVX512: ## %bb.0: 1292; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] 1293; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero 1294; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa] 1295; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2] 1296; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] 1297; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] 1298; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1299 %vecext = extractelement <4 x i32> %x, i32 0 1300 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 1301 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 1302 %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef> 1303 %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6> 1304 ret <4 x i32> %vecinit5 1305} 1306 1307;; Test for a bug in the first implementation of LowerBuildVectorv4X86 1308define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { 1309; SSE-LABEL: test_insertps_no_undef: 1310; SSE: ## %bb.0: 1311; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 1312; SSE-NEXT: blendps $7, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc8,0x07] 1313; SSE-NEXT: ## xmm1 = xmm0[0,1,2],xmm1[3] 1314; SSE-NEXT: maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1] 1315; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1316; 1317; AVX1-LABEL: test_insertps_no_undef: 1318; AVX1: ## %bb.0: 1319; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 1320; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08] 1321; AVX1-NEXT: ## xmm1 = xmm0[0,1,2],xmm1[3] 1322; AVX1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1] 1323; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1324; 1325; AVX512-LABEL: test_insertps_no_undef: 1326; AVX512: ## %bb.0: 1327; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 1328; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc9,0x08] 1329; AVX512-NEXT: ## xmm1 = xmm0[0,1,2],xmm1[3] 1330; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1] 1331; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1332 %vecext = extractelement <4 x float> %x, i32 0 1333 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1334 %vecext1 = extractelement <4 x float> %x, i32 1 1335 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 1336 %vecext3 = extractelement <4 x float> %x, i32 2 1337 %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2 1338 %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3 1339 %mask = fcmp olt <4 x float> %vecinit5, %x 1340 %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5 1341 ret <4 x float> %res 1342} 1343 1344define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) { 1345; SSE-LABEL: blendvb_fallback: 1346; SSE: ## %bb.0: 1347; SSE-NEXT: psllw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x0f] 1348; SSE-NEXT: psraw $15, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x0f] 1349; SSE-NEXT: pblendvb %xmm0, %xmm1, %xmm2 ## encoding: [0x66,0x0f,0x38,0x10,0xd1] 1350; SSE-NEXT: movdqa %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc2] 1351; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1352; 1353; AVX1-LABEL: blendvb_fallback: 1354; AVX1: ## %bb.0: 1355; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x0f] 1356; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x0f] 1357; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x4c,0xc1,0x00] 1358; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1359; 1360; AVX512-LABEL: blendvb_fallback: 1361; AVX512: ## %bb.0: 1362; AVX512-NEXT: vpsllw $15, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x0f] 1363; AVX512-NEXT: vpmovw2m %xmm0, %k1 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc8] 1364; AVX512-NEXT: vpblendmw %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x66,0xc1] 1365; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1366 %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y 1367 ret <8 x i16> %ret 1368} 1369 1370; On X86, account for the argument's move to registers 1371define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture readonly %pb) { 1372; X86-SSE-LABEL: insertps_from_vector_load: 1373; X86-SSE: ## %bb.0: 1374; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1375; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 1376; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1377; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1378; X86-SSE-NEXT: retl ## encoding: [0xc3] 1379; 1380; X86-AVX1-LABEL: insertps_from_vector_load: 1381; X86-AVX1: ## %bb.0: 1382; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1383; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 1384; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1385; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1386; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1387; 1388; X86-AVX512-LABEL: insertps_from_vector_load: 1389; X86-AVX512: ## %bb.0: 1390; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1391; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 1392; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1393; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1394; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1395; 1396; X64-SSE-LABEL: insertps_from_vector_load: 1397; X64-SSE: ## %bb.0: 1398; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 1399; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1400; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1401; X64-SSE-NEXT: retq ## encoding: [0xc3] 1402; 1403; X64-AVX1-LABEL: insertps_from_vector_load: 1404; X64-AVX1: ## %bb.0: 1405; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] 1406; X64-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1407; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1408; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1409; 1410; X64-AVX512-LABEL: insertps_from_vector_load: 1411; X64-AVX512: ## %bb.0: 1412; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 1413; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30] 1414; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1415; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1416 %1 = load <4 x float>, ptr %pb, align 16 1417 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) 1418 ret <4 x float> %2 1419} 1420 1421;; Use a non-zero CountS for insertps 1422;; Try to match a bit more of the instr, since we need the load's offset. 1423define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocapture readonly %pb) { 1424; X86-SSE-LABEL: insertps_from_vector_load_offset: 1425; X86-SSE: ## %bb.0: 1426; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1427; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 1428; X86-SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] 1429; X86-SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1430; X86-SSE-NEXT: retl ## encoding: [0xc3] 1431; 1432; X86-AVX1-LABEL: insertps_from_vector_load_offset: 1433; X86-AVX1: ## %bb.0: 1434; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1435; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 1436; X86-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1437; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1438; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1439; 1440; X86-AVX512-LABEL: insertps_from_vector_load_offset: 1441; X86-AVX512: ## %bb.0: 1442; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1443; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 1444; X86-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1445; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1446; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1447; 1448; X64-SSE-LABEL: insertps_from_vector_load_offset: 1449; X64-SSE: ## %bb.0: 1450; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 1451; X64-SSE-NEXT: insertps $96, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x60] 1452; X64-SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1453; X64-SSE-NEXT: retq ## encoding: [0xc3] 1454; 1455; X64-AVX1-LABEL: insertps_from_vector_load_offset: 1456; X64-AVX1: ## %bb.0: 1457; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] 1458; X64-AVX1-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1459; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1460; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1461; 1462; X64-AVX512-LABEL: insertps_from_vector_load_offset: 1463; X64-AVX512: ## %bb.0: 1464; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 1465; X64-AVX512-NEXT: vinsertps $96, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x60] 1466; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1],xmm1[1],xmm0[3] 1467; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1468 %1 = load <4 x float>, ptr %pb, align 16 1469 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96) 1470 ret <4 x float> %2 1471} 1472 1473;; Try to match a bit more of the instr, since we need the load's offset. 1474define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocapture readonly %pb, i64 %index) { 1475; X86-SSE-LABEL: insertps_from_vector_load_offset_2: 1476; X86-SSE: ## %bb.0: 1477; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1478; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] 1479; X86-SSE-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] 1480; X86-SSE-NEXT: movaps (%eax,%ecx), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x08] 1481; X86-SSE-NEXT: insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0] 1482; X86-SSE-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1483; X86-SSE-NEXT: retl ## encoding: [0xc3] 1484; 1485; X86-AVX1-LABEL: insertps_from_vector_load_offset_2: 1486; X86-AVX1: ## %bb.0: 1487; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1488; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] 1489; X86-AVX1-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] 1490; X86-AVX1-NEXT: vmovaps (%eax,%ecx), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x08] 1491; X86-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1492; X86-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1493; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1494; 1495; X86-AVX512-LABEL: insertps_from_vector_load_offset_2: 1496; X86-AVX512: ## %bb.0: 1497; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1498; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] 1499; X86-AVX512-NEXT: shll $4, %ecx ## encoding: [0xc1,0xe1,0x04] 1500; X86-AVX512-NEXT: vmovaps (%eax,%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x08] 1501; X86-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1502; X86-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1503; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1504; 1505; X64-SSE-LABEL: insertps_from_vector_load_offset_2: 1506; X64-SSE: ## %bb.0: 1507; X64-SSE-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] 1508; X64-SSE-NEXT: movaps (%rdi,%rsi), %xmm1 ## encoding: [0x0f,0x28,0x0c,0x37] 1509; X64-SSE-NEXT: insertps $192, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xc0] 1510; X64-SSE-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1511; X64-SSE-NEXT: retq ## encoding: [0xc3] 1512; 1513; X64-AVX1-LABEL: insertps_from_vector_load_offset_2: 1514; X64-AVX1: ## %bb.0: 1515; X64-AVX1-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] 1516; X64-AVX1-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0c,0x37] 1517; X64-AVX1-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1518; X64-AVX1-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1519; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1520; 1521; X64-AVX512-LABEL: insertps_from_vector_load_offset_2: 1522; X64-AVX512: ## %bb.0: 1523; X64-AVX512-NEXT: shlq $4, %rsi ## encoding: [0x48,0xc1,0xe6,0x04] 1524; X64-AVX512-NEXT: vmovaps (%rdi,%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0c,0x37] 1525; X64-AVX512-NEXT: vinsertps $192, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xc0] 1526; X64-AVX512-NEXT: ## xmm0 = xmm1[3],xmm0[1,2,3] 1527; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1528 %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index 1529 %2 = load <4 x float>, ptr %1, align 16 1530 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192) 1531 ret <4 x float> %3 1532} 1533 1534define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, ptr nocapture readonly %fb, i64 %index) { 1535; X86-SSE-LABEL: insertps_from_broadcast_loadf32: 1536; X86-SSE: ## %bb.0: 1537; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1538; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1539; X86-SSE-NEXT: insertps $48, (%ecx,%eax,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0x81,0x30] 1540; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1541; X86-SSE-NEXT: retl ## encoding: [0xc3] 1542; 1543; X86-AVX1-LABEL: insertps_from_broadcast_loadf32: 1544; X86-AVX1: ## %bb.0: 1545; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1546; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1547; X86-AVX1-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30] 1548; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1549; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1550; 1551; X86-AVX512-LABEL: insertps_from_broadcast_loadf32: 1552; X86-AVX512: ## %bb.0: 1553; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1554; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1555; X86-AVX512-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30] 1556; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1557; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1558; 1559; X64-SSE-LABEL: insertps_from_broadcast_loadf32: 1560; X64-SSE: ## %bb.0: 1561; X64-SSE-NEXT: insertps $48, (%rdi,%rsi,4), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0x04,0xb7,0x30] 1562; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1563; X64-SSE-NEXT: retq ## encoding: [0xc3] 1564; 1565; X64-AVX1-LABEL: insertps_from_broadcast_loadf32: 1566; X64-AVX1: ## %bb.0: 1567; X64-AVX1-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30] 1568; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1569; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1570; 1571; X64-AVX512-LABEL: insertps_from_broadcast_loadf32: 1572; X64-AVX512: ## %bb.0: 1573; X64-AVX512-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30] 1574; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1575; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1576 %1 = getelementptr inbounds float, ptr %fb, i64 %index 1577 %2 = load float, ptr %1, align 4 1578 %3 = insertelement <4 x float> undef, float %2, i32 0 1579 %4 = insertelement <4 x float> %3, float %2, i32 1 1580 %5 = insertelement <4 x float> %4, float %2, i32 2 1581 %6 = insertelement <4 x float> %5, float %2, i32 3 1582 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 1583 ret <4 x float> %7 1584} 1585 1586define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapture readonly %b) { 1587; X86-SSE-LABEL: insertps_from_broadcast_loadv4f32: 1588; X86-SSE: ## %bb.0: 1589; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1590; X86-SSE-NEXT: movups (%eax), %xmm1 ## encoding: [0x0f,0x10,0x08] 1591; X86-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1592; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1593; X86-SSE-NEXT: retl ## encoding: [0xc3] 1594; 1595; X86-AVX1-LABEL: insertps_from_broadcast_loadv4f32: 1596; X86-AVX1: ## %bb.0: 1597; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1598; X86-AVX1-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] 1599; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1600; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1601; 1602; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32: 1603; X86-AVX512: ## %bb.0: 1604; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1605; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30] 1606; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1607; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1608; 1609; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32: 1610; X64-SSE: ## %bb.0: 1611; X64-SSE-NEXT: movups (%rdi), %xmm1 ## encoding: [0x0f,0x10,0x0f] 1612; X64-SSE-NEXT: insertps $48, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x30] 1613; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0] 1614; X64-SSE-NEXT: retq ## encoding: [0xc3] 1615; 1616; X64-AVX1-LABEL: insertps_from_broadcast_loadv4f32: 1617; X64-AVX1: ## %bb.0: 1618; X64-AVX1-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] 1619; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1620; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1621; 1622; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32: 1623; X64-AVX512: ## %bb.0: 1624; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30] 1625; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0] 1626; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1627 %1 = load <4 x float>, ptr %b, align 4 1628 %2 = extractelement <4 x float> %1, i32 0 1629 %3 = insertelement <4 x float> undef, float %2, i32 0 1630 %4 = insertelement <4 x float> %3, float %2, i32 1 1631 %5 = insertelement <4 x float> %4, float %2, i32 2 1632 %6 = insertelement <4 x float> %5, float %2, i32 3 1633 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 1634 ret <4 x float> %7 1635} 1636 1637define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr nocapture readonly %fb, i64 %index) { 1638; X86-SSE-LABEL: insertps_from_broadcast_multiple_use: 1639; X86-SSE: ## %bb.0: 1640; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1641; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1642; X86-SSE-NEXT: movss (%ecx,%eax,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero 1643; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x24,0x81] 1644; X86-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30] 1645; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1646; X86-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30] 1647; X86-SSE-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1648; X86-SSE-NEXT: addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1] 1649; X86-SSE-NEXT: insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30] 1650; X86-SSE-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[0] 1651; X86-SSE-NEXT: insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30] 1652; X86-SSE-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[0] 1653; X86-SSE-NEXT: addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda] 1654; X86-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3] 1655; X86-SSE-NEXT: retl ## encoding: [0xc3] 1656; 1657; X86-AVX1-LABEL: insertps_from_broadcast_multiple_use: 1658; X86-AVX1: ## %bb.0: 1659; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1660; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1661; X86-AVX1-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] 1662; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] 1663; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] 1664; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] 1665; X86-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] 1666; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1667; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08] 1668; X86-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3] 1669; X86-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08] 1670; X86-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3] 1671; X86-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] 1672; X86-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1673; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1674; 1675; X86-AVX512-LABEL: insertps_from_broadcast_multiple_use: 1676; X86-AVX512: ## %bb.0: 1677; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08] 1678; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04] 1679; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81] 1680; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] 1681; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] 1682; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] 1683; X86-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] 1684; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08] 1685; X86-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3] 1686; X86-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08] 1687; X86-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3] 1688; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1689; X86-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb] 1690; X86-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1691; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1692; 1693; X64-SSE-LABEL: insertps_from_broadcast_multiple_use: 1694; X64-SSE: ## %bb.0: 1695; X64-SSE-NEXT: movss (%rdi,%rsi,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero 1696; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x24,0xb7] 1697; X64-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30] 1698; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0] 1699; X64-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30] 1700; X64-SSE-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[0] 1701; X64-SSE-NEXT: addps %xmm1, %xmm0 ## encoding: [0x0f,0x58,0xc1] 1702; X64-SSE-NEXT: insertps $48, %xmm4, %xmm2 ## encoding: [0x66,0x0f,0x3a,0x21,0xd4,0x30] 1703; X64-SSE-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[0] 1704; X64-SSE-NEXT: insertps $48, %xmm4, %xmm3 ## encoding: [0x66,0x0f,0x3a,0x21,0xdc,0x30] 1705; X64-SSE-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[0] 1706; X64-SSE-NEXT: addps %xmm2, %xmm3 ## encoding: [0x0f,0x58,0xda] 1707; X64-SSE-NEXT: addps %xmm3, %xmm0 ## encoding: [0x0f,0x58,0xc3] 1708; X64-SSE-NEXT: retq ## encoding: [0xc3] 1709; 1710; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use: 1711; X64-AVX1: ## %bb.0: 1712; X64-AVX1-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] 1713; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] 1714; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] 1715; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] 1716; X64-AVX1-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] 1717; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1718; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08] 1719; X64-AVX1-NEXT: ## xmm1 = xmm2[0,1,2],xmm4[3] 1720; X64-AVX1-NEXT: vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08] 1721; X64-AVX1-NEXT: ## xmm2 = xmm3[0,1,2],xmm4[3] 1722; X64-AVX1-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca] 1723; X64-AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1] 1724; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1725; 1726; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use: 1727; X64-AVX512: ## %bb.0: 1728; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7] 1729; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08] 1730; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[3] 1731; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08] 1732; X64-AVX512-NEXT: ## xmm1 = xmm1[0,1,2],xmm4[3] 1733; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08] 1734; X64-AVX512-NEXT: ## xmm2 = xmm2[0,1,2],xmm4[3] 1735; X64-AVX512-NEXT: vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08] 1736; X64-AVX512-NEXT: ## xmm3 = xmm3[0,1,2],xmm4[3] 1737; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1738; X64-AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb] 1739; X64-AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] 1740; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1741 %1 = getelementptr inbounds float, ptr %fb, i64 %index 1742 %2 = load float, ptr %1, align 4 1743 %3 = insertelement <4 x float> undef, float %2, i32 0 1744 %4 = insertelement <4 x float> %3, float %2, i32 1 1745 %5 = insertelement <4 x float> %4, float %2, i32 2 1746 %6 = insertelement <4 x float> %5, float %2, i32 3 1747 %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48) 1748 %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48) 1749 %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48) 1750 %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48) 1751 %11 = fadd <4 x float> %7, %8 1752 %12 = fadd <4 x float> %9, %10 1753 %13 = fadd <4 x float> %11, %12 1754 ret <4 x float> %13 1755} 1756 1757define <4 x float> @insertps_with_undefs(<4 x float> %a, ptr %b) { 1758; X86-SSE-LABEL: insertps_with_undefs: 1759; X86-SSE: ## %bb.0: 1760; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1761; X86-SSE-NEXT: movss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero 1762; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x08] 1763; X86-SSE-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8] 1764; X86-SSE-NEXT: ## xmm1 = xmm1[0],xmm0[0] 1765; X86-SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] 1766; X86-SSE-NEXT: retl ## encoding: [0xc3] 1767; 1768; X86-AVX1-LABEL: insertps_with_undefs: 1769; X86-AVX1: ## %bb.0: 1770; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1771; X86-AVX1-NEXT: vmovss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero 1772; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x08] 1773; X86-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0] 1774; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1775; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1776; 1777; X86-AVX512-LABEL: insertps_with_undefs: 1778; X86-AVX512: ## %bb.0: 1779; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1780; X86-AVX512-NEXT: vmovss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero 1781; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08] 1782; X86-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] 1783; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1784; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1785; 1786; X64-SSE-LABEL: insertps_with_undefs: 1787; X64-SSE: ## %bb.0: 1788; X64-SSE-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero 1789; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x0f] 1790; X64-SSE-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8] 1791; X64-SSE-NEXT: ## xmm1 = xmm1[0],xmm0[0] 1792; X64-SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] 1793; X64-SSE-NEXT: retq ## encoding: [0xc3] 1794; 1795; X64-AVX1-LABEL: insertps_with_undefs: 1796; X64-AVX1: ## %bb.0: 1797; X64-AVX1-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero 1798; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x0f] 1799; X64-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0] 1800; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1801; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1802; 1803; X64-AVX512-LABEL: insertps_with_undefs: 1804; X64-AVX512: ## %bb.0: 1805; X64-AVX512-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero 1806; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f] 1807; X64-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0] 1808; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0] 1809; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1810 %1 = load float, ptr %b, align 4 1811 %2 = insertelement <4 x float> undef, float %1, i32 0 1812 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> 1813 ret <4 x float> %result 1814} 1815 1816; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using 1817; the destination index to change the load, instead of the source index. 1818define <4 x float> @pr20087(<4 x float> %a, ptr%ptr) { 1819; X86-SSE-LABEL: pr20087: 1820; X86-SSE: ## %bb.0: 1821; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1822; X86-SSE-NEXT: movaps (%eax), %xmm1 ## encoding: [0x0f,0x28,0x08] 1823; X86-SSE-NEXT: insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2] 1824; X86-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1825; X86-SSE-NEXT: retl ## encoding: [0xc3] 1826; 1827; X86-AVX1-LABEL: pr20087: 1828; X86-AVX1: ## %bb.0: 1829; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1830; X86-AVX1-NEXT: vmovaps (%eax), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x08] 1831; X86-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1832; X86-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1833; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1834; 1835; X86-AVX512-LABEL: pr20087: 1836; X86-AVX512: ## %bb.0: 1837; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1838; X86-AVX512-NEXT: vmovaps (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x08] 1839; X86-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1840; X86-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1841; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1842; 1843; X64-SSE-LABEL: pr20087: 1844; X64-SSE: ## %bb.0: 1845; X64-SSE-NEXT: movaps (%rdi), %xmm1 ## encoding: [0x0f,0x28,0x0f] 1846; X64-SSE-NEXT: insertps $178, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb2] 1847; X64-SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1848; X64-SSE-NEXT: retq ## encoding: [0xc3] 1849; 1850; X64-AVX1-LABEL: pr20087: 1851; X64-AVX1: ## %bb.0: 1852; X64-AVX1-NEXT: vmovaps (%rdi), %xmm1 ## encoding: [0xc5,0xf8,0x28,0x0f] 1853; X64-AVX1-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1854; X64-AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1855; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1856; 1857; X64-AVX512-LABEL: pr20087: 1858; X64-AVX512: ## %bb.0: 1859; X64-AVX512-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f] 1860; X64-AVX512-NEXT: vinsertps $178, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb2] 1861; X64-AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[2],xmm1[2] 1862; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1863 %load = load <4 x float> , ptr%ptr 1864 %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> 1865 ret <4 x float> %ret 1866} 1867 1868; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1> 1869define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, ptr noalias nocapture %RET) #1 { 1870; X86-SSE-LABEL: insertps_pr20411: 1871; X86-SSE: ## %bb.0: 1872; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1873; X86-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] 1874; X86-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] 1875; X86-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3] 1876; X86-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1877; X86-SSE-NEXT: movdqu %xmm1, (%eax) ## encoding: [0xf3,0x0f,0x7f,0x08] 1878; X86-SSE-NEXT: retl ## encoding: [0xc3] 1879; 1880; X86-AVX1-LABEL: insertps_pr20411: 1881; X86-AVX1: ## %bb.0: 1882; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1883; X86-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] 1884; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] 1885; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1886; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1887; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ## encoding: [0xc5,0xf8,0x11,0x00] 1888; X86-AVX1-NEXT: retl ## encoding: [0xc3] 1889; 1890; X86-AVX512-LABEL: insertps_pr20411: 1891; X86-AVX512: ## %bb.0: 1892; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] 1893; X86-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] 1894; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] 1895; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1896; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1897; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00] 1898; X86-AVX512-NEXT: retl ## encoding: [0xc3] 1899; 1900; X64-SSE-LABEL: insertps_pr20411: 1901; X64-SSE: ## %bb.0: 1902; X64-SSE-NEXT: pshufd $238, %xmm1, %xmm1 ## encoding: [0x66,0x0f,0x70,0xc9,0xee] 1903; X64-SSE-NEXT: ## xmm1 = xmm1[2,3,2,3] 1904; X64-SSE-NEXT: pblendw $243, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc8,0xf3] 1905; X64-SSE-NEXT: ## xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1906; X64-SSE-NEXT: movdqu %xmm1, (%rdi) ## encoding: [0xf3,0x0f,0x7f,0x0f] 1907; X64-SSE-NEXT: retq ## encoding: [0xc3] 1908; 1909; X64-AVX1-LABEL: insertps_pr20411: 1910; X64-AVX1: ## %bb.0: 1911; X64-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] 1912; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] 1913; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1914; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1915; X64-AVX1-NEXT: vmovups %xmm0, (%rdi) ## encoding: [0xc5,0xf8,0x11,0x07] 1916; X64-AVX1-NEXT: retq ## encoding: [0xc3] 1917; 1918; X64-AVX512-LABEL: insertps_pr20411: 1919; X64-AVX512: ## %bb.0: 1920; X64-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] 1921; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] 1922; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] 1923; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1924; X64-AVX512-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07] 1925; X64-AVX512-NEXT: retq ## encoding: [0xc3] 1926 %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 0, i32 7, i32 undef, i32 undef> 1927 store <4 x i32> %shuffle117, ptr %RET, align 4 1928 ret void 1929} 1930 1931define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) { 1932; SSE-LABEL: insertps_4: 1933; SSE: ## %bb.0: 1934; SSE-NEXT: insertps $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xaa] 1935; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm1[2],zero 1936; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1937; 1938; AVX1-LABEL: insertps_4: 1939; AVX1: ## %bb.0: 1940; AVX1-NEXT: vinsertps $170, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa] 1941; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm1[2],zero 1942; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1943; 1944; AVX512-LABEL: insertps_4: 1945; AVX512: ## %bb.0: 1946; AVX512-NEXT: vinsertps $170, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xaa] 1947; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm1[2],zero 1948; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1949 %vecext = extractelement <4 x float> %A, i32 0 1950 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1951 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 1952 %vecext2 = extractelement <4 x float> %B, i32 2 1953 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 1954 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1955 ret <4 x float> %vecinit4 1956} 1957 1958define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) { 1959; SSE-LABEL: insertps_5: 1960; SSE: ## %bb.0: 1961; SSE-NEXT: insertps $92, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x5c] 1962; SSE-NEXT: ## xmm0 = xmm0[0],xmm1[1],zero,zero 1963; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1964; 1965; AVX1-LABEL: insertps_5: 1966; AVX1: ## %bb.0: 1967; AVX1-NEXT: vinsertps $92, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c] 1968; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],zero,zero 1969; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1970; 1971; AVX512-LABEL: insertps_5: 1972; AVX512: ## %bb.0: 1973; AVX512-NEXT: vinsertps $92, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x5c] 1974; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],zero,zero 1975; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1976 %vecext = extractelement <4 x float> %A, i32 0 1977 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 1978 %vecext1 = extractelement <4 x float> %B, i32 1 1979 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 1980 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 1981 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 1982 ret <4 x float> %vecinit4 1983} 1984 1985define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) { 1986; SSE-LABEL: insertps_6: 1987; SSE: ## %bb.0: 1988; SSE-NEXT: insertps $169, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xa9] 1989; SSE-NEXT: ## xmm0 = zero,xmm0[1],xmm1[2],zero 1990; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1991; 1992; AVX1-LABEL: insertps_6: 1993; AVX1: ## %bb.0: 1994; AVX1-NEXT: vinsertps $169, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9] 1995; AVX1-NEXT: ## xmm0 = zero,xmm0[1],xmm1[2],zero 1996; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 1997; 1998; AVX512-LABEL: insertps_6: 1999; AVX512: ## %bb.0: 2000; AVX512-NEXT: vinsertps $169, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xa9] 2001; AVX512-NEXT: ## xmm0 = zero,xmm0[1],xmm1[2],zero 2002; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2003 %vecext = extractelement <4 x float> %A, i32 1 2004 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 2005 %vecext1 = extractelement <4 x float> %B, i32 2 2006 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 2007 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 2008 ret <4 x float> %vecinit3 2009} 2010 2011define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) { 2012; SSE-LABEL: insertps_7: 2013; SSE: ## %bb.0: 2014; SSE-NEXT: insertps $106, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x6a] 2015; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm1[1],zero 2016; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2017; 2018; AVX1-LABEL: insertps_7: 2019; AVX1: ## %bb.0: 2020; AVX1-NEXT: vinsertps $106, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a] 2021; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm1[1],zero 2022; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2023; 2024; AVX512-LABEL: insertps_7: 2025; AVX512: ## %bb.0: 2026; AVX512-NEXT: vinsertps $106, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x6a] 2027; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm1[1],zero 2028; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2029 %vecext = extractelement <4 x float> %A, i32 0 2030 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 2031 %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 2032 %vecext2 = extractelement <4 x float> %B, i32 1 2033 %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 2034 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 2035 ret <4 x float> %vecinit4 2036} 2037 2038define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) { 2039; SSE-LABEL: insertps_8: 2040; SSE: ## %bb.0: 2041; SSE-NEXT: insertps $28, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x1c] 2042; SSE-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero 2043; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2044; 2045; AVX1-LABEL: insertps_8: 2046; AVX1: ## %bb.0: 2047; AVX1-NEXT: vinsertps $28, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c] 2048; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero 2049; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2050; 2051; AVX512-LABEL: insertps_8: 2052; AVX512: ## %bb.0: 2053; AVX512-NEXT: vinsertps $28, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x1c] 2054; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[0],zero,zero 2055; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2056 %vecext = extractelement <4 x float> %A, i32 0 2057 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 2058 %vecext1 = extractelement <4 x float> %B, i32 0 2059 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 2060 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 2061 %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 2062 ret <4 x float> %vecinit4 2063} 2064 2065define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) { 2066; SSE-LABEL: insertps_9: 2067; SSE: ## %bb.0: 2068; SSE-NEXT: insertps $25, %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xc8,0x19] 2069; SSE-NEXT: ## xmm1 = zero,xmm0[0],xmm1[2],zero 2070; SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1] 2071; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2072; 2073; AVX1-LABEL: insertps_9: 2074; AVX1: ## %bb.0: 2075; AVX1-NEXT: vinsertps $25, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19] 2076; AVX1-NEXT: ## xmm0 = zero,xmm0[0],xmm1[2],zero 2077; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2078; 2079; AVX512-LABEL: insertps_9: 2080; AVX512: ## %bb.0: 2081; AVX512-NEXT: vinsertps $25, %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x19] 2082; AVX512-NEXT: ## xmm0 = zero,xmm0[0],xmm1[2],zero 2083; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2084 %vecext = extractelement <4 x float> %A, i32 0 2085 %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 2086 %vecext1 = extractelement <4 x float> %B, i32 2 2087 %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 2088 %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 2089 ret <4 x float> %vecinit3 2090} 2091 2092define <4 x float> @insertps_10(<4 x float> %A) { 2093; SSE-LABEL: insertps_10: 2094; SSE: ## %bb.0: 2095; SSE-NEXT: insertps $42, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc0,0x2a] 2096; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[0],zero 2097; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2098; 2099; AVX1-LABEL: insertps_10: 2100; AVX1: ## %bb.0: 2101; AVX1-NEXT: vinsertps $42, %xmm0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a] 2102; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[0],zero 2103; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2104; 2105; AVX512-LABEL: insertps_10: 2106; AVX512: ## %bb.0: 2107; AVX512-NEXT: vinsertps $42, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc0,0x2a] 2108; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[0],zero 2109; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2110 %vecext = extractelement <4 x float> %A, i32 0 2111 %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0 2112 %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2 2113 ret <4 x float> %vecbuild2 2114} 2115 2116define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) { 2117; SSE-LABEL: build_vector_to_shuffle_1: 2118; SSE: ## %bb.0: 2119; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 2120; SSE-NEXT: blendps $5, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x05] 2121; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2122; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2123; 2124; AVX1-LABEL: build_vector_to_shuffle_1: 2125; AVX1: ## %bb.0: 2126; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 2127; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] 2128; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2129; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2130; 2131; AVX512-LABEL: build_vector_to_shuffle_1: 2132; AVX512: ## %bb.0: 2133; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 2134; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] 2135; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2136; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2137 %vecext = extractelement <4 x float> %A, i32 1 2138 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 2139 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 2140 %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 2141 ret <4 x float> %vecinit3 2142} 2143 2144define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) { 2145; SSE-LABEL: build_vector_to_shuffle_2: 2146; SSE: ## %bb.0: 2147; SSE-NEXT: xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9] 2148; SSE-NEXT: blendps $13, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x0d] 2149; SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2150; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2151; 2152; AVX1-LABEL: build_vector_to_shuffle_2: 2153; AVX1: ## %bb.0: 2154; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] 2155; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] 2156; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2157; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2158; 2159; AVX512-LABEL: build_vector_to_shuffle_2: 2160; AVX512: ## %bb.0: 2161; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] 2162; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] 2163; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 2164; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] 2165 %vecext = extractelement <4 x float> %A, i32 1 2166 %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 2167 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 2168 ret <4 x float> %vecinit1 2169} 2170