; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX1030 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX1100 %s

; Codegen test for extractelement/insertelement on <4 x i32> values that are
; carried around a loop and indexed by a non-constant index (the kernel
; argument %q, zero-extended to i64).
;
; In the expected output for all four targets, the dynamic index is compared
; against constant lane numbers with s_cmp_eq_u32 and the matching lane is
; picked with s_cselect_b32. The gfx90a and gfx940 expectations use 64-bit
; condition masks (s_cselect_b64 / vcc), while the gfx1030 and gfx1100
; expectations use 32-bit ones (vcc_lo / exec_lo).
;
; The check lines inside the function are generated by the script named in the
; NOTE line at the top of this file; regenerate them with that script instead
; of editing them by hand.
define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX90A-LABEL: test_insert_extract:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX90A-NEXT: s_mov_b32 s2, 0
; GFX90A-NEXT: s_and_b64 vcc, exec, -1
; GFX90A-NEXT: s_mov_b32 s3, 0
; GFX90A-NEXT: s_mov_b32 s4, 0
; GFX90A-NEXT: s_mov_b32 s5, 0
; GFX90A-NEXT: s_mov_b32 s6, 0
; GFX90A-NEXT: .LBB0_1: ; %for.body
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s7, s4, s3
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s7, s6, s7
; GFX90A-NEXT: s_or_b32 s7, s7, s0
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec
; GFX90A-NEXT: s_cselect_b32 s6, s7, s6
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec
; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
; GFX90A-NEXT: s_cmp_eq_u32 s1, 0
; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX90A-NEXT: s_cselect_b32 s2, 0, s2
; GFX90A-NEXT: s_mov_b64 vcc, vcc
; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: test_insert_extract:
; GFX940: ; %bb.0: ; %entry
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NEXT: s_mov_b32 s2, 0
; GFX940-NEXT: s_and_b64 vcc, exec, -1
; GFX940-NEXT: s_mov_b32 s3, 0
; GFX940-NEXT: s_mov_b32 s4, 0
; GFX940-NEXT: s_mov_b32 s5, 0
; GFX940-NEXT: s_mov_b32 s6, 0
; GFX940-NEXT: .LBB0_1: ; %for.body
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: s_cmp_eq_u32 s1, 1
; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX940-NEXT: s_cselect_b32 s7, s4, s3
; GFX940-NEXT: s_cmp_eq_u32 s1, 2
; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX940-NEXT: s_cselect_b32 s7, s5, s7
; GFX940-NEXT: s_cmp_eq_u32 s1, 3
; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX940-NEXT: s_cselect_b32 s7, s6, s7
; GFX940-NEXT: s_or_b32 s7, s7, s0
; GFX940-NEXT: s_cmp_eq_u32 s1, 1
; GFX940-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec
; GFX940-NEXT: s_cselect_b32 s4, s7, s4
; GFX940-NEXT: s_cmp_eq_u32 s1, 3
; GFX940-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec
; GFX940-NEXT: s_cselect_b32 s6, s7, s6
; GFX940-NEXT: s_cmp_eq_u32 s1, 2
; GFX940-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec
; GFX940-NEXT: s_cselect_b32 s5, s7, s5
; GFX940-NEXT: s_cmp_eq_u32 s1, 0
; GFX940-NEXT: s_cselect_b32 s3, s7, s3
; GFX940-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX940-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec
; GFX940-NEXT: s_cselect_b32 s2, 0, s2
; GFX940-NEXT: s_mov_b64 vcc, vcc
; GFX940-NEXT: s_cbranch_vccnz .LBB0_1
; GFX940-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX940-NEXT: s_endpgm
;
; GFX1030-LABEL: test_insert_extract:
; GFX1030: ; %bb.0: ; %entry
; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX1030-NEXT: s_mov_b32 s2, 0
; GFX1030-NEXT: s_mov_b32 s3, 0
; GFX1030-NEXT: s_mov_b32 s4, 0
; GFX1030-NEXT: s_mov_b32 s5, 0
; GFX1030-NEXT: s_mov_b32 s6, 0
; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo
; GFX1030-NEXT: .p2align 6
; GFX1030-NEXT: .LBB0_1: ; %for.body
; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1030-NEXT: s_cselect_b32 s7, s4, s3
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1030-NEXT: s_cselect_b32 s7, s6, s7
; GFX1030-NEXT: s_or_b32 s7, s7, s0
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo
; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s9, -1, 0
; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo
; GFX1030-NEXT: s_cselect_b32 s6, s7, s6
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s10, -1, 0
; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo
; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
; GFX1030-NEXT: s_cmp_eq_u32 s1, 0
; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
; GFX1030-NEXT: s_or_b32 s7, s10, s8
; GFX1030-NEXT: s_or_b32 s7, s9, s7
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1030-NEXT: s_cselect_b32 s2, 0, s2
; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: test_insert_extract:
; GFX1100: ; %bb.0: ; %entry
; GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX1100-NEXT: s_mov_b32 s2, 0
; GFX1100-NEXT: s_mov_b32 s3, 0
; GFX1100-NEXT: s_mov_b32 s4, 0
; GFX1100-NEXT: s_mov_b32 s5, 0
; GFX1100-NEXT: s_mov_b32 s6, 0
; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo
; GFX1100-NEXT: .p2align 6
; GFX1100-NEXT: .LBB0_1: ; %for.body
; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
; GFX1100-NEXT: s_cselect_b32 s7, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1100-NEXT: s_cselect_b32 s7, s4, s3
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
; GFX1100-NEXT: s_cselect_b32 s7, s6, s7
; GFX1100-NEXT: s_or_b32 s7, s7, s0
; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo
; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s9, -1, 0
; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo
; GFX1100-NEXT: s_cselect_b32 s6, s7, s6
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s10, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo
; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
; GFX1100-NEXT: s_cmp_eq_u32 s1, 0
; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
; GFX1100-NEXT: s_or_b32 s7, s10, s8
; GFX1100-NEXT: s_or_b32 s7, s9, s7
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
; GFX1100-NEXT: s_cselect_b32 s2, 0, s2
; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1100-NEXT: s_endpgm
entry:
  ; Writes 0 into element 0 of an all-zero vector, so %init is still all zeros.
  %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0
  br label %for.body

for.body: ; preds = %for.body, %entry
  ; Two loop-carried vectors: %x1 is updated at constant index 0, %x2 at the
  ; dynamic index.
  %x1 = phi <4 x i32> [ %init, %entry ], [ %i4, %for.body ]
  %x2 = phi <4 x i32> [ zeroinitializer, %entry ], [ %i2, %for.body ]
  ; Dynamic element index taken from kernel argument %q.
  %idxprom = zext i32 %q to i64
  ; Read-modify-write of %x2 at the dynamic index. The value is named %add but
  ; is computed with a bitwise `or` against %p.
  %e1 = extractelement <4 x i32> %x2, i64 %idxprom
  %add = or i32 %e1, %p
  %i2 = insertelement <4 x i32> %x2, i32 %add, i64 %idxprom
  ; Dynamic-index read of %x1, written back at constant index 0.
  %e3 = extractelement <4 x i32> %x1, i64 %idxprom
  %i4 = insertelement <4 x i32> %x1, i32 %e3, i64 0
  ; Unconditional back-edge: the loop has no exit in the IR.
  br label %for.body
}
