; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-SDAG %s
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG %s
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s

; Codegen coverage for the permlane16 / permlanex16 intrinsics when the permuted
; value is a pointer or a <3 x pointer> vector, using SelectionDAG
; (-global-isel=0) on gfx1010, gfx1100 and gfx1200.
;
; Every function follows the same shape: %src0 is passed as both of the first
; two intrinsic operands, %src1 and %src2 are the two i32 operands (the checks
; show them being moved to SGPRs with v_readfirstlane_b32), both trailing i1
; flags are false, and the result is stored to %out.
;
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; permlane16 on a flat pointer: the checks expect two 32-bit permlane ops and a
; 64-bit store.
define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_p0:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_p0:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_p0:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr @llvm.amdgcn.permlane16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on a flat pointer: the checks expect two 32-bit permlanex ops and
; a 64-bit store.
define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_p0:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_p0:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_p0:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[2:3], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr @llvm.amdgcn.permlanex16.p0(ptr %src0, ptr %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on <3 x ptr>: the checks expect six 32-bit permlane ops (v2..v7)
; and the result written as one 64-bit store at offset 16 plus one 128-bit store.
define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_v3p0:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v9
; GFX10-SDAG-NEXT:    v_permlane16_b32 v7, v7, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v6, v6, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v5, v5, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_v3p0:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    s_clause 0x1
; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_v3p0:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v7, v7, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v6, v6, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v5, v5, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    s_clause 0x1
; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr> @llvm.amdgcn.permlane16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr> %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on <3 x ptr>: the checks expect six 32-bit permlanex ops (v2..v7)
; and the result written as one 64-bit store at offset 16 plus one 128-bit store.
define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_v3p0:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v8
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v9
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v7, v7, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v6, v6, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v5, v5, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
; GFX10-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_v3p0:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v7, v7, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v6, v6, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v5, v5, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    s_clause 0x1
; GFX11-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
; GFX11-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_v3p0:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v7, v7, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v6, v6, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v5, v5, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    s_clause 0x1
; GFX12-SDAG-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
; GFX12-SDAG-NEXT:    global_store_b128 v[0:1], v[2:5], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr> @llvm.amdgcn.permlanex16.v3p0(<3 x ptr> %src0, <3 x ptr> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr> %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on an addrspace(3) pointer: the checks expect a single 32-bit
; permlane op and a single dword store.
define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_p3:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_p3:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_p3:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr addrspace(3) @llvm.amdgcn.permlane16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr addrspace(3) %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on an addrspace(3) pointer: the checks expect a single 32-bit
; permlanex op and a single dword store.
define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_p3:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_p3:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_p3:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr addrspace(3) @llvm.amdgcn.permlanex16.p3(ptr addrspace(3) %src0, ptr addrspace(3) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr addrspace(3) %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on <3 x ptr addrspace(3)>: the checks expect three 32-bit permlane
; ops (v2..v4) and a single three-dword store.
define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_v3p3:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_v3p3:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_v3p3:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlane16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on <3 x ptr addrspace(3)>: the checks expect three 32-bit
; permlanex ops (v2..v4) and a single three-dword store.
define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_v3p3:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_v3p3:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_v3p3:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr addrspace(3)> @llvm.amdgcn.permlanex16.v3p3(<3 x ptr addrspace(3)> %src0, <3 x ptr addrspace(3)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr addrspace(3)> %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on an addrspace(5) pointer: the checks expect a single 32-bit
; permlane op and a single dword store.
define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_p5:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_p5:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_p5:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr addrspace(5) @llvm.amdgcn.permlane16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr addrspace(5) %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on an addrspace(5) pointer: the checks expect a single 32-bit
; permlanex op and a single dword store.
define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_p5:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_p5:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_p5:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr addrspace(5) @llvm.amdgcn.permlanex16.p5(ptr addrspace(5) %src0, ptr addrspace(5) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr addrspace(5) %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on <3 x ptr addrspace(5)>: the checks expect three 32-bit permlane
; ops (v2..v4) and a single three-dword store.
define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_v3p5:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_v3p5:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_v3p5:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlane16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on <3 x ptr addrspace(5)>: the checks expect three 32-bit
; permlanex ops (v2..v4) and a single three-dword store.
define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_v3p5:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_v3p5:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_v3p5:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr addrspace(5)> @llvm.amdgcn.permlanex16.v3p5(<3 x ptr addrspace(5)> %src0, <3 x ptr addrspace(5)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr addrspace(5)> %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on an addrspace(6) pointer: the checks expect a single 32-bit
; permlane op and a single dword store.
define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_p6:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_p6:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_p6:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr addrspace(6) @llvm.amdgcn.permlane16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr addrspace(6) %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on an addrspace(6) pointer: the checks expect a single 32-bit
; permlanex op and a single dword store.
define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_p6:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v3
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dword v[0:1], v2, off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_p6:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_p6:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v4
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b32 v[0:1], v2, off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call ptr addrspace(6) @llvm.amdgcn.permlanex16.p6(ptr addrspace(6) %src0, ptr addrspace(6) %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store ptr addrspace(6) %v, ptr addrspace(1) %out
  ret void
}

; permlane16 on <3 x ptr addrspace(6)>: the checks expect three 32-bit permlane
; ops (v2..v4) and a single three-dword store.
define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlane16_v3p6:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
; GFX10-SDAG-NEXT:    v_permlane16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlane16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlane16_v3p6:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlane16_v3p6:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlane16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlane16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlane16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
  ret void
}

; permlanex16 on <3 x ptr addrspace(6)>: the checks expect three 32-bit
; permlanex ops (v2..v4) and a single three-dword store.
define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2) {
; GFX10-SDAG-LABEL: v_permlanex16_v3p6:
; GFX10-SDAG:       ; %bb.0:
; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
; GFX10-SDAG-NEXT:    v_readfirstlane_b32 s5, v6
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s4, s5
; GFX10-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s4, s5
; GFX10-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: v_permlanex16_v3p6:
; GFX11-SDAG:       ; %bb.0:
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX11-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX11-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX11-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-SDAG-LABEL: v_permlanex16_v3p6:
; GFX12-SDAG:       ; %bb.0:
; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s0, v5
; GFX12-SDAG-NEXT:    v_readfirstlane_b32 s1, v6
; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v4, v4, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v3, v3, s0, s1
; GFX12-SDAG-NEXT:    v_permlanex16_b32 v2, v2, s0, s1
; GFX12-SDAG-NEXT:    global_store_b96 v[0:1], v[2:4], off
; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
  %v = call <3 x ptr addrspace(6)> @llvm.amdgcn.permlanex16.v3p6(<3 x ptr addrspace(6)> %src0, <3 x ptr addrspace(6)> %src0, i32 %src1, i32 %src2, i1 false, i1 false)
  store <3 x ptr addrspace(6)> %v, ptr addrspace(1) %out
  ret void
}