; Test summary (review notes). The expected-assembly lines in this file are
; produced by utils/update_llc_test_checks.py; regenerate them with that script
; instead of editing them by hand.
;
; The RUN lines compile this file with llc for amdgcn-amd-amdhsa on gfx900,
; gfx940, gfx1010, gfx1100 and gfx1200, once with -global-isel=0 and once with
; -global-isel=1, so each function carries one block of expected assembly per
; (subtarget, instruction selector) pair.
;
; @buffer_nontemporal_load_store loads an i32 through the ptr addrspace(7)
; argument %in and stores it through the ptr addrspace(7) argument %out; both
; accesses carry !nontemporal !0 (the !0 node is defined outside this excerpt).
; The expected assembly pins the cache-policy modifiers on the resulting
; buffer_load / buffer_store instructions, for example "glc slc" on gfx900,
; "nt" (with "sc0 ... sc1" on the store) on gfx940, and th:TH_LOAD_NT /
; th:TH_STORE_NT on gfx1200.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-SDAG %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX9-GISEL %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940,GFX940-SDAG %s 5; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX940,GFX940-GISEL %s 6; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-SDAG %s 7; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10,GFX10-GISEL %s 8; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-SDAG %s 9; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-GISEL %s 10; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s 11; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s 12 13define amdgpu_kernel void @buffer_nontemporal_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) { 14; GFX9-SDAG-LABEL: buffer_nontemporal_load_store: 15; GFX9-SDAG: ; %bb.0: ; %entry 16; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 17; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 18; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 19; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15 20; GFX9-SDAG-NEXT: s_mov_b32 s15, s10 21; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) 22; 
GFX9-SDAG-NEXT: s_mov_b32 s14, s7 23; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 24; GFX9-SDAG-NEXT: s_mov_b32 s12, s5 25; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] 26; GFX9-SDAG-NEXT: s_mov_b32 s13, s6 27; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 28; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc slc 29; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 30; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 31; GFX9-SDAG-NEXT: s_mov_b32 s9, s10 32; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) 33; GFX9-SDAG-NEXT: s_mov_b32 s8, s7 34; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] 35; GFX9-SDAG-NEXT: s_mov_b32 s8, s5 36; GFX9-SDAG-NEXT: s_mov_b32 s9, s6 37; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 38; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) 39; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc 40; GFX9-SDAG-NEXT: s_endpgm 41; 42; GFX9-GISEL-LABEL: buffer_nontemporal_load_store: 43; GFX9-GISEL: ; %bb.0: ; %entry 44; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 45; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 46; GFX9-GISEL-NEXT: s_mov_b32 s11, 0 47; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 48; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 49; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 50; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 51; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 52; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] 53; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 54; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] 55; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 56; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc slc 57; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 58; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 59; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 60; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 61; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 62; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 63; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 64; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] 65; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 66; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], 
s[6:7] 67; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 68; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 69; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc 70; GFX9-GISEL-NEXT: s_endpgm 71; 72; GFX940-SDAG-LABEL: buffer_nontemporal_load_store: 73; GFX940-SDAG: ; %bb.0: ; %entry 74; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 75; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 76; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 77; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30 78; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) 79; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 80; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 81; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 82; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40 83; GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36 84; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] 85; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] 86; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 87; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8 88; GFX940-SDAG-NEXT: s_nop 0 89; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4 90; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6 91; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7 92; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 93; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec 94; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) 95; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 96; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 97; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) 98; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 99; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 100; GFX940-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 101; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 102; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 103; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6 104; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7 105; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] 106; GFX940-SDAG-NEXT: s_nop 0 107; 
GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] 108; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 109; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 110; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen nt 111; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 112; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9 113; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] 114; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_1 115; GFX940-SDAG-NEXT: ; %bb.2: 116; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3] 117; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8 118; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec 119; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) 120; GFX940-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 121; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0 122; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1 123; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2 124; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3 125; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 126; GFX940-SDAG-NEXT: s_nop 0 127; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 128; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 129; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 130; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 nt sc1 131; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 132; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8 133; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4 134; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] 135; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB0_3 136; GFX940-SDAG-NEXT: ; %bb.4: 137; GFX940-SDAG-NEXT: s_endpgm 138; 139; GFX940-GISEL-LABEL: buffer_nontemporal_load_store: 140; GFX940-GISEL: ; %bb.0: ; %entry 141; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 142; GFX940-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10 143; GFX940-GISEL-NEXT: s_mov_b32 s7, 0 144; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 145; GFX940-GISEL-NEXT: s_mov_b32 s10, s7 146; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) 147; GFX940-GISEL-NEXT: s_mov_b32 
s6, s1 148; GFX940-GISEL-NEXT: s_mov_b32 s9, s2 149; GFX940-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] 150; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 151; GFX940-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] 152; GFX940-GISEL-NEXT: v_mov_b32_e32 v0, s0 153; GFX940-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen nt 154; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 155; GFX940-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 156; GFX940-GISEL-NEXT: s_mov_b32 s4, s7 157; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 158; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) 159; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 160; GFX940-GISEL-NEXT: s_mov_b32 s5, s2 161; GFX940-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] 162; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 163; GFX940-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] 164; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, s0 165; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) 166; GFX940-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 nt sc1 167; GFX940-GISEL-NEXT: s_endpgm 168; 169; GFX10-SDAG-LABEL: buffer_nontemporal_load_store: 170; GFX10-SDAG: ; %bb.0: ; %entry 171; GFX10-SDAG-NEXT: s_clause 0x1 172; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 173; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 174; GFX10-SDAG-NEXT: s_mov_b32 s10, 0 175; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15 176; GFX10-SDAG-NEXT: s_mov_b32 s13, s10 177; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0 178; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) 179; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4 180; GFX10-SDAG-NEXT: s_mov_b32 s12, s7 181; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11] 182; GFX10-SDAG-NEXT: s_mov_b32 s12, s5 183; GFX10-SDAG-NEXT: s_mov_b32 s13, s6 184; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen slc 185; GFX10-SDAG-NEXT: s_clause 0x1 186; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 187; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 188; GFX10-SDAG-NEXT: s_mov_b32 s9, s10 189; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) 190; GFX10-SDAG-NEXT: 
v_mov_b32_e32 v1, s4 191; GFX10-SDAG-NEXT: s_mov_b32 s8, s7 192; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] 193; GFX10-SDAG-NEXT: s_mov_b32 s8, s5 194; GFX10-SDAG-NEXT: s_mov_b32 s9, s6 195; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) 196; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc 197; GFX10-SDAG-NEXT: s_endpgm 198; 199; GFX10-GISEL-LABEL: buffer_nontemporal_load_store: 200; GFX10-GISEL: ; %bb.0: ; %entry 201; GFX10-GISEL-NEXT: s_clause 0x1 202; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 203; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10 204; GFX10-GISEL-NEXT: s_mov_b32 s7, 0 205; GFX10-GISEL-NEXT: s_mov_b32 s10, s7 206; GFX10-GISEL-NEXT: s_mov_b32 s4, s7 207; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 208; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 209; GFX10-GISEL-NEXT: s_mov_b32 s11, s2 210; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 211; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11] 212; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 213; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] 214; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen slc 215; GFX10-GISEL-NEXT: s_clause 0x1 216; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 217; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 218; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30 219; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 220; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 221; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 222; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0 223; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] 224; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 225; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] 226; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 227; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc 228; GFX10-GISEL-NEXT: s_endpgm 229; 230; GFX11-SDAG-LABEL: buffer_nontemporal_load_store: 231; GFX11-SDAG: ; %bb.0: ; %entry 232; GFX11-SDAG-NEXT: s_clause 0x2 233; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 234; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 
235; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 236; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) 237; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 238; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 239; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 240; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 241; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 242; GFX11-SDAG-NEXT: s_clause 0x1 243; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 244; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 245; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 246; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off 247; GFX11-SDAG-NEXT: s_clause 0x1 248; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 249; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 250; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6 251; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0 252; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) 253; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1 254; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo 255; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) 256; GFX11-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 257; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4 258; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5 259; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6 260; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7 261; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 262; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] 263; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] 264; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 265; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 266; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 267; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen slc dlc 268; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 269; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9 270; GFX11-SDAG-NEXT: 
s_xor_b32 exec_lo, exec_lo, s0 271; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_1 272; GFX11-SDAG-NEXT: ; %bb.2: 273; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 274; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8 275; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo 276; GFX11-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 277; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 278; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 279; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2 280; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3 281; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 282; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 283; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 284; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 285; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 286; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 287; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) 288; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen glc slc dlc 289; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 290; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8 291; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4 292; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 293; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB0_3 294; GFX11-SDAG-NEXT: ; %bb.4: 295; GFX11-SDAG-NEXT: s_endpgm 296; 297; GFX11-GISEL-LABEL: buffer_nontemporal_load_store: 298; GFX11-GISEL: ; %bb.0: ; %entry 299; GFX11-GISEL-NEXT: s_clause 0x1 300; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 301; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 302; GFX11-GISEL-NEXT: s_mov_b32 s9, 0 303; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 304; GFX11-GISEL-NEXT: s_mov_b32 s10, s9 305; GFX11-GISEL-NEXT: s_mov_b32 s6, s9 306; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 307; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 308; GFX11-GISEL-NEXT: s_mov_b32 s11, s2 309; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 310; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] 311; 
GFX11-GISEL-NEXT: s_mov_b32 s8, s3 312; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 313; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] 314; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen slc dlc 315; GFX11-GISEL-NEXT: s_clause 0x1 316; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 317; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 318; GFX11-GISEL-NEXT: s_mov_b32 s4, s9 319; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 320; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 321; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 322; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 323; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 324; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 325; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 326; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] 327; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) 328; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen glc slc dlc 329; GFX11-GISEL-NEXT: s_endpgm 330; 331; GFX12-SDAG-LABEL: buffer_nontemporal_load_store: 332; GFX12-SDAG: ; %bb.0: ; %entry 333; GFX12-SDAG-NEXT: s_clause 0x2 334; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 335; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 336; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 337; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 338; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 339; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 340; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 341; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 342; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 343; GFX12-SDAG-NEXT: s_clause 0x1 344; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 345; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 346; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 347; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off 348; GFX12-SDAG-NEXT: s_clause 0x1 349; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], 
off, off offset:8 350; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 351; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6 352; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0 353; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 354; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1 355; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo 356; GFX12-SDAG-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 357; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2 358; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4 359; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 360; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 361; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 362; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 363; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] 364; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] 365; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 366; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 367; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 368; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 369; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 370; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 371; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT 372; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 373; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 374; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 375; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 376; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_1 377; GFX12-SDAG-NEXT: ; %bb.2: 378; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1 379; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8 380; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo 381; GFX12-SDAG-NEXT: .LBB0_3: ; =>This Inner Loop Header: Depth=1 382; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 383; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 384; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 385; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 386; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 387; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 388; GFX12-SDAG-NEXT: 
v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 389; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 390; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 391; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 392; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 393; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 394; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 395; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 396; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT 397; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 398; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 399; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 400; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 401; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 402; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB0_3 403; GFX12-SDAG-NEXT: ; %bb.4: 404; GFX12-SDAG-NEXT: s_endpgm 405; 406; GFX12-GISEL-LABEL: buffer_nontemporal_load_store: 407; GFX12-GISEL: ; %bb.0: ; %entry 408; GFX12-GISEL-NEXT: s_clause 0x1 409; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 410; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 411; GFX12-GISEL-NEXT: s_mov_b32 s9, 0 412; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 413; GFX12-GISEL-NEXT: s_mov_b32 s10, s9 414; GFX12-GISEL-NEXT: s_mov_b32 s6, s9 415; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 416; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 417; GFX12-GISEL-NEXT: s_mov_b32 s11, s2 418; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 419; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] 420; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 421; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 422; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] 423; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT 424; GFX12-GISEL-NEXT: s_clause 0x1 425; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 426; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 427; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 428; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 429; GFX12-GISEL-NEXT: s_wait_alu 0xfffe 430; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 431; 
GFX12-GISEL-NEXT: s_mov_b32 s8, s1 432; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 433; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 434; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 435; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 436; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] 437; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 438; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT 439; GFX12-GISEL-NEXT: s_endpgm 440entry: 441 %val = load i32, ptr addrspace(7) %in, !nontemporal !0 442 store i32 %val, ptr addrspace(7) %out, !nontemporal !0 443 ret void 444} 445 446define amdgpu_kernel void @buffer_nontemporal_and_volatile_load_store(ptr addrspace(7) %in, ptr addrspace(7) %out) { 447; GFX9-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: 448; GFX9-SDAG: ; %bb.0: ; %entry 449; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 450; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 451; GFX9-SDAG-NEXT: s_mov_b32 s10, 0 452; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s15 453; GFX9-SDAG-NEXT: s_mov_b32 s15, s10 454; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) 455; GFX9-SDAG-NEXT: s_mov_b32 s14, s7 456; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0 457; GFX9-SDAG-NEXT: s_mov_b32 s12, s5 458; GFX9-SDAG-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] 459; GFX9-SDAG-NEXT: s_mov_b32 s13, s6 460; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 461; GFX9-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc 462; GFX9-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 463; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 464; GFX9-SDAG-NEXT: s_mov_b32 s9, s10 465; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) 466; GFX9-SDAG-NEXT: s_mov_b32 s8, s7 467; GFX9-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] 468; GFX9-SDAG-NEXT: s_mov_b32 s8, s5 469; GFX9-SDAG-NEXT: s_mov_b32 s9, s6 470; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s4 471; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) 472; GFX9-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen 473; GFX9-SDAG-NEXT: s_endpgm 474; 475; 
GFX9-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: 476; GFX9-GISEL: ; %bb.0: ; %entry 477; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 478; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x10 479; GFX9-GISEL-NEXT: s_mov_b32 s11, 0 480; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 481; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 482; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 483; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 484; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 485; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] 486; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 487; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] 488; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 489; GFX9-GISEL-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen glc 490; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 491; GFX9-GISEL-NEXT: s_load_dword s7, s[8:9], 0x30 492; GFX9-GISEL-NEXT: s_mov_b32 s4, s11 493; GFX9-GISEL-NEXT: s_mov_b32 s6, s11 494; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 495; GFX9-GISEL-NEXT: s_mov_b32 s10, s1 496; GFX9-GISEL-NEXT: s_mov_b32 s5, s2 497; GFX9-GISEL-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] 498; GFX9-GISEL-NEXT: s_mov_b32 s10, s3 499; GFX9-GISEL-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] 500; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s0 501; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 502; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen 503; GFX9-GISEL-NEXT: s_endpgm 504; 505; GFX940-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: 506; GFX940-SDAG: ; %bb.0: ; %entry 507; GFX940-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 508; GFX940-SDAG-NEXT: s_load_dword s6, s[4:5], 0x10 509; GFX940-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 510; GFX940-SDAG-NEXT: s_load_dword s7, s[4:5], 0x30 511; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) 512; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 513; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 514; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1 515; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[10:11], off, off offset:40 516; 
GFX940-SDAG-NEXT: scratch_load_dword v4, off, off offset:36 517; GFX940-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] 518; GFX940-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] 519; GFX940-SDAG-NEXT: scratch_store_dwordx4 off, v[0:3], off sc0 sc1 520; GFX940-SDAG-NEXT: scratch_load_dwordx2 v[12:13], off, off offset:8 521; GFX940-SDAG-NEXT: s_nop 0 522; GFX940-SDAG-NEXT: scratch_load_dword v0, off, off offset:4 523; GFX940-SDAG-NEXT: v_mov_b32_e32 v7, s6 524; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s7 525; GFX940-SDAG-NEXT: v_mov_b32_e32 v9, s0 526; GFX940-SDAG-NEXT: s_mov_b64 s[2:3], exec 527; GFX940-SDAG-NEXT: s_waitcnt vmcnt(4) 528; GFX940-SDAG-NEXT: v_mov_b32_e32 v6, v11 529; GFX940-SDAG-NEXT: v_mov_b32_e32 v5, v10 530; GFX940-SDAG-NEXT: s_waitcnt vmcnt(1) 531; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, v13 532; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, v12 533; GFX940-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 534; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v4 535; GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v5 536; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v6 537; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v7 538; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[4:5] 539; GFX940-SDAG-NEXT: s_nop 0 540; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[6:7] 541; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 542; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 543; GFX940-SDAG-NEXT: buffer_load_dword v8, v9, s[4:7], 0 offen sc0 sc1 544; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 545; GFX940-SDAG-NEXT: ; implicit-def: $vgpr9 546; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] 547; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_1 548; GFX940-SDAG-NEXT: ; %bb.2: 549; GFX940-SDAG-NEXT: s_mov_b64 exec, s[2:3] 550; GFX940-SDAG-NEXT: v_mov_b32_e32 v4, s8 551; GFX940-SDAG-NEXT: s_mov_b64 s[0:1], exec 552; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) 553; GFX940-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 554; GFX940-SDAG-NEXT: v_readfirstlane_b32 s4, v0 555; 
GFX940-SDAG-NEXT: v_readfirstlane_b32 s5, v1 556; GFX940-SDAG-NEXT: v_readfirstlane_b32 s6, v2 557; GFX940-SDAG-NEXT: v_readfirstlane_b32 s7, v3 558; GFX940-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] 559; GFX940-SDAG-NEXT: s_nop 0 560; GFX940-SDAG-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3] 561; GFX940-SDAG-NEXT: s_and_b64 s[0:1], vcc, s[0:1] 562; GFX940-SDAG-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] 563; GFX940-SDAG-NEXT: buffer_store_dword v8, v4, s[4:7], 0 offen sc0 sc1 564; GFX940-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 565; GFX940-SDAG-NEXT: ; implicit-def: $vgpr8 566; GFX940-SDAG-NEXT: ; implicit-def: $vgpr4 567; GFX940-SDAG-NEXT: s_xor_b64 exec, exec, s[0:1] 568; GFX940-SDAG-NEXT: s_cbranch_execnz .LBB1_3 569; GFX940-SDAG-NEXT: ; %bb.4: 570; GFX940-SDAG-NEXT: s_endpgm 571; 572; GFX940-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: 573; GFX940-GISEL: ; %bb.0: ; %entry 574; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 575; GFX940-GISEL-NEXT: s_load_dword s11, s[4:5], 0x10 576; GFX940-GISEL-NEXT: s_mov_b32 s7, 0 577; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 578; GFX940-GISEL-NEXT: s_mov_b32 s10, s7 579; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) 580; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 581; GFX940-GISEL-NEXT: s_mov_b32 s9, s2 582; GFX940-GISEL-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] 583; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 584; GFX940-GISEL-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] 585; GFX940-GISEL-NEXT: v_mov_b32_e32 v0, s0 586; GFX940-GISEL-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen sc0 sc1 587; GFX940-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 588; GFX940-GISEL-NEXT: s_load_dword s9, s[4:5], 0x30 589; GFX940-GISEL-NEXT: s_mov_b32 s4, s7 590; GFX940-GISEL-NEXT: s_mov_b32 s8, s7 591; GFX940-GISEL-NEXT: s_waitcnt lgkmcnt(0) 592; GFX940-GISEL-NEXT: s_mov_b32 s6, s1 593; GFX940-GISEL-NEXT: s_mov_b32 s5, s2 594; GFX940-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] 595; GFX940-GISEL-NEXT: s_mov_b32 s6, s3 596; GFX940-GISEL-NEXT: 
s_or_b64 s[6:7], s[6:7], s[8:9] 597; GFX940-GISEL-NEXT: v_mov_b32_e32 v1, s0 598; GFX940-GISEL-NEXT: s_waitcnt vmcnt(0) 599; GFX940-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen sc0 sc1 600; GFX940-GISEL-NEXT: s_endpgm 601; 602; GFX10-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: 603; GFX10-SDAG: ; %bb.0: ; %entry 604; GFX10-SDAG-NEXT: s_clause 0x1 605; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 606; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x10 607; GFX10-SDAG-NEXT: s_mov_b32 s10, 0 608; GFX10-SDAG-NEXT: s_add_u32 s0, s0, s15 609; GFX10-SDAG-NEXT: s_mov_b32 s13, s10 610; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0 611; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) 612; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s4 613; GFX10-SDAG-NEXT: s_mov_b32 s12, s7 614; GFX10-SDAG-NEXT: s_or_b64 s[14:15], s[12:13], s[10:11] 615; GFX10-SDAG-NEXT: s_mov_b32 s12, s5 616; GFX10-SDAG-NEXT: s_mov_b32 s13, s6 617; GFX10-SDAG-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc dlc 618; GFX10-SDAG-NEXT: s_clause 0x1 619; GFX10-SDAG-NEXT: s_load_dword s11, s[8:9], 0x30 620; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20 621; GFX10-SDAG-NEXT: s_mov_b32 s9, s10 622; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) 623; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s4 624; GFX10-SDAG-NEXT: s_mov_b32 s8, s7 625; GFX10-SDAG-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] 626; GFX10-SDAG-NEXT: s_mov_b32 s8, s5 627; GFX10-SDAG-NEXT: s_mov_b32 s9, s6 628; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) 629; GFX10-SDAG-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen 630; GFX10-SDAG-NEXT: s_endpgm 631; 632; GFX10-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: 633; GFX10-GISEL: ; %bb.0: ; %entry 634; GFX10-GISEL-NEXT: s_clause 0x1 635; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 636; GFX10-GISEL-NEXT: s_load_dword s5, s[8:9], 0x10 637; GFX10-GISEL-NEXT: s_mov_b32 s7, 0 638; GFX10-GISEL-NEXT: s_mov_b32 s10, s7 639; GFX10-GISEL-NEXT: s_mov_b32 s4, s7 640; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 641; 
GFX10-GISEL-NEXT: s_mov_b32 s6, s1 642; GFX10-GISEL-NEXT: s_mov_b32 s11, s2 643; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 644; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[6:7], s[10:11] 645; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 646; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[6:7], s[4:5] 647; GFX10-GISEL-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc dlc 648; GFX10-GISEL-NEXT: s_clause 0x1 649; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 650; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20 651; GFX10-GISEL-NEXT: s_load_dword s11, s[8:9], 0x30 652; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 653; GFX10-GISEL-NEXT: s_mov_b32 s6, s1 654; GFX10-GISEL-NEXT: s_mov_b32 s5, s2 655; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s0 656; GFX10-GISEL-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] 657; GFX10-GISEL-NEXT: s_mov_b32 s6, s3 658; GFX10-GISEL-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] 659; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 660; GFX10-GISEL-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen 661; GFX10-GISEL-NEXT: s_endpgm 662; 663; GFX11-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: 664; GFX11-SDAG: ; %bb.0: ; %entry 665; GFX11-SDAG-NEXT: s_clause 0x2 666; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 667; GFX11-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 668; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 669; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) 670; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 671; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 672; GFX11-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 673; GFX11-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 674; GFX11-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 675; GFX11-SDAG-NEXT: s_clause 0x1 676; GFX11-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 677; GFX11-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 678; GFX11-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 679; GFX11-SDAG-NEXT: scratch_store_b128 off, v[7:10], off 680; GFX11-SDAG-NEXT: s_clause 0x1 
681; GFX11-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 682; GFX11-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 683; GFX11-SDAG-NEXT: v_mov_b32_e32 v7, s6 684; GFX11-SDAG-NEXT: v_mov_b32_e32 v9, s0 685; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) 686; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, s1 687; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo 688; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) 689; GFX11-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 690; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v4 691; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v5 692; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v6 693; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v7 694; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 695; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] 696; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] 697; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 698; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 699; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 700; GFX11-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], 0 offen glc dlc 701; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 702; GFX11-SDAG-NEXT: ; implicit-def: $vgpr9 703; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 704; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_1 705; GFX11-SDAG-NEXT: ; %bb.2: 706; GFX11-SDAG-NEXT: s_mov_b32 exec_lo, s1 707; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, s8 708; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo 709; GFX11-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 710; GFX11-SDAG-NEXT: v_readfirstlane_b32 s4, v0 711; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v1 712; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v2 713; GFX11-SDAG-NEXT: v_readfirstlane_b32 s7, v3 714; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 715; GFX11-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 716; GFX11-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 717; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 718; GFX11-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 719; GFX11-SDAG-NEXT: s_and_saveexec_b32 s0, s0 720; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) 721; GFX11-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], 0 offen dlc 722; GFX11-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 723; GFX11-SDAG-NEXT: ; implicit-def: $vgpr8 724; GFX11-SDAG-NEXT: ; implicit-def: $vgpr4 725; GFX11-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 726; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB1_3 727; GFX11-SDAG-NEXT: ; %bb.4: 728; GFX11-SDAG-NEXT: s_endpgm 729; 730; GFX11-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: 731; GFX11-GISEL: ; %bb.0: ; %entry 732; GFX11-GISEL-NEXT: s_clause 0x1 733; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 734; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 735; GFX11-GISEL-NEXT: s_mov_b32 s9, 0 736; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 737; GFX11-GISEL-NEXT: s_mov_b32 s10, s9 738; GFX11-GISEL-NEXT: s_mov_b32 s6, s9 739; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 740; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 741; GFX11-GISEL-NEXT: s_mov_b32 s11, s2 742; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 743; GFX11-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] 744; GFX11-GISEL-NEXT: s_mov_b32 s8, s3 745; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 746; GFX11-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] 747; GFX11-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen glc dlc 748; GFX11-GISEL-NEXT: s_clause 0x1 749; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 750; GFX11-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 751; GFX11-GISEL-NEXT: s_mov_b32 s4, s9 752; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) 753; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s0 754; GFX11-GISEL-NEXT: s_mov_b32 s8, s1 755; GFX11-GISEL-NEXT: s_mov_b32 s5, s2 756; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 757; GFX11-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 758; GFX11-GISEL-NEXT: s_mov_b32 s8, 
s3 759; GFX11-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] 760; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) 761; GFX11-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], 0 offen dlc 762; GFX11-GISEL-NEXT: s_endpgm 763; 764; GFX12-SDAG-LABEL: buffer_nontemporal_and_volatile_load_store: 765; GFX12-SDAG: ; %bb.0: ; %entry 766; GFX12-SDAG-NEXT: s_clause 0x2 767; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 768; GFX12-SDAG-NEXT: s_load_b128 s[8:11], s[4:5], 0x20 769; GFX12-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x10 770; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 771; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 772; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 773; GFX12-SDAG-NEXT: v_dual_mov_b32 v7, s8 :: v_dual_mov_b32 v8, s9 774; GFX12-SDAG-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v10, s11 775; GFX12-SDAG-NEXT: scratch_store_b128 off, v[0:3], off offset:32 776; GFX12-SDAG-NEXT: s_clause 0x1 777; GFX12-SDAG-NEXT: scratch_load_b64 v[5:6], off, off offset:40 778; GFX12-SDAG-NEXT: scratch_load_b32 v4, off, off offset:36 779; GFX12-SDAG-NEXT: s_load_b32 s1, s[4:5], 0x30 780; GFX12-SDAG-NEXT: scratch_store_b128 off, v[7:10], off 781; GFX12-SDAG-NEXT: s_clause 0x1 782; GFX12-SDAG-NEXT: scratch_load_b64 v[1:2], off, off offset:8 783; GFX12-SDAG-NEXT: scratch_load_b32 v0, off, off offset:4 784; GFX12-SDAG-NEXT: v_mov_b32_e32 v7, s6 785; GFX12-SDAG-NEXT: v_mov_b32_e32 v9, s0 786; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 787; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, s1 788; GFX12-SDAG-NEXT: s_mov_b32 s1, exec_lo 789; GFX12-SDAG-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 790; GFX12-SDAG-NEXT: s_wait_loadcnt 0x2 791; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v4 792; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v5 793; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v6 794; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v7 795; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 796; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[4:5] 797; 
GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[6:7] 798; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 799; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 800; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 801; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 802; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 803; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 804; GFX12-SDAG-NEXT: buffer_load_b32 v8, v9, s[4:7], null offen th:TH_LOAD_NT scope:SCOPE_SYS 805; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 806; GFX12-SDAG-NEXT: ; implicit-def: $vgpr9 807; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 808; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 809; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_1 810; GFX12-SDAG-NEXT: ; %bb.2: 811; GFX12-SDAG-NEXT: s_mov_b32 exec_lo, s1 812; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, s8 813; GFX12-SDAG-NEXT: s_mov_b32 s0, exec_lo 814; GFX12-SDAG-NEXT: .LBB1_3: ; =>This Inner Loop Header: Depth=1 815; GFX12-SDAG-NEXT: s_wait_loadcnt 0x1 816; GFX12-SDAG-NEXT: v_readfirstlane_b32 s4, v0 817; GFX12-SDAG-NEXT: v_readfirstlane_b32 s5, v1 818; GFX12-SDAG-NEXT: v_readfirstlane_b32 s6, v2 819; GFX12-SDAG-NEXT: v_readfirstlane_b32 s7, v3 820; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 821; GFX12-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] 822; GFX12-SDAG-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] 823; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 824; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 825; GFX12-SDAG-NEXT: s_and_b32 s0, vcc_lo, s0 826; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 827; GFX12-SDAG-NEXT: s_and_saveexec_b32 s0, s0 828; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 829; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 830; GFX12-SDAG-NEXT: buffer_store_b32 v8, v4, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS 831; GFX12-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 832; GFX12-SDAG-NEXT: ; implicit-def: $vgpr8 833; GFX12-SDAG-NEXT: ; implicit-def: $vgpr4 834; GFX12-SDAG-NEXT: s_wait_alu 0xfffe 835; GFX12-SDAG-NEXT: s_xor_b32 exec_lo, exec_lo, s0 
836; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB1_3 837; GFX12-SDAG-NEXT: ; %bb.4: 838; GFX12-SDAG-NEXT: s_endpgm 839; 840; GFX12-GISEL-LABEL: buffer_nontemporal_and_volatile_load_store: 841; GFX12-GISEL: ; %bb.0: ; %entry 842; GFX12-GISEL-NEXT: s_clause 0x1 843; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 844; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x10 845; GFX12-GISEL-NEXT: s_mov_b32 s9, 0 846; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 847; GFX12-GISEL-NEXT: s_mov_b32 s10, s9 848; GFX12-GISEL-NEXT: s_mov_b32 s6, s9 849; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 850; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 851; GFX12-GISEL-NEXT: s_mov_b32 s11, s2 852; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 853; GFX12-GISEL-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] 854; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 855; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 856; GFX12-GISEL-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] 857; GFX12-GISEL-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen th:TH_LOAD_NT scope:SCOPE_SYS 858; GFX12-GISEL-NEXT: s_clause 0x1 859; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x20 860; GFX12-GISEL-NEXT: s_load_b32 s7, s[4:5], 0x30 861; GFX12-GISEL-NEXT: s_mov_b32 s4, s9 862; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 863; GFX12-GISEL-NEXT: s_wait_alu 0xfffe 864; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s0 865; GFX12-GISEL-NEXT: s_mov_b32 s8, s1 866; GFX12-GISEL-NEXT: s_mov_b32 s5, s2 867; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 868; GFX12-GISEL-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 869; GFX12-GISEL-NEXT: s_mov_b32 s8, s3 870; GFX12-GISEL-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] 871; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 872; GFX12-GISEL-NEXT: buffer_store_b32 v0, v1, s[4:7], null offen th:TH_STORE_NT scope:SCOPE_SYS 873; GFX12-GISEL-NEXT: s_endpgm 874entry: 875 %val = load volatile i32, ptr addrspace(7) %in, !nontemporal !0 876 store volatile i32 %val, ptr addrspace(7) %out, !nontemporal !0 877 ret void 878} 879 880!0 = !{i32 
1} 881;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 882; GFX10: {{.*}} 883; GFX11: {{.*}} 884; GFX12: {{.*}} 885; GFX9: {{.*}} 886; GFX940: {{.*}} 887