; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s

; Each kernel below performs one i32 access through an addrspace(5) pointer
; that carries !nontemporal metadata, and is compiled at -O0 once per llc
; invocation listed above. The expected instruction sequences that follow each
; function header are generated output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Nontemporal load through the addrspace(5) kernel argument %in; the loaded
; value is then written to %out (addrspace(1)) with an ordinary store.
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_add_u32 s0, s0, s15
; GFX6-NEXT:    s_addc_u32 s1, s1, 0
; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s11, s5
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT:    s_mov_b32 s5, s11
; GFX6-NEXT:    s_mov_b32 s6, s10
; GFX6-NEXT:    s_mov_b32 s7, s9
; GFX6-NEXT:    v_mov_b32_e32 v0, s8
; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_add_u32 s0, s0, s15
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, off, s2 nt
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, off, s2 nt
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_0:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    scratch_load_b32 v1, off, s2 slc dlc
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_0:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    scratch_load_b32 v1, off, s2 slc dlc
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_load_0:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_load_0:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    scratch_load_b32 v1, off, s2 th:TH_LOAD_NT
; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
  %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Same shape as the kernel above, except that the nontemporal load reads
; element %tid of %in, where %tid is the result of llvm.amdgcn.workitem.id.x,
; so the expected output also contains the address computation.
define amdgpu_kernel void @private_nontemporal_load_1(
; GFX6-LABEL: private_nontemporal_load_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_add_u32 s0, s0, s15
; GFX6-NEXT:    s_addc_u32 s1, s1, 0
; GFX6-NEXT:    s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT:    s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s11, s5
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT:    s_mov_b32 s9, 0x100f000
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT:    s_mov_b32 s5, s11
; GFX6-NEXT:    s_mov_b32 s6, s10
; GFX6-NEXT:    s_mov_b32 s7, s9
; GFX6-NEXT:    s_mov_b32 s9, 2
; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s9, v0
; GFX6-NEXT:    v_add_i32_e64 v0, s[8:9], s8, v0
; GFX6-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen glc slc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_add_u32 s0, s0, s15
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT:    s_mov_b32 s7, 2
; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s7, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e64 v0, s[6:7], s6, v0
; GFX7-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen glc slc
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_waitcnt vmcnt(0)
; GFX7-NEXT:    flat_store_dword v[0:1], v2
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT:    s_load_dword s7, s[8:9], 0x0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
; GFX10-WGP-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT:    s_load_dword s7, s[8:9], 0x0
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT:    s_mov_b32 s6, 2
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v1, s6, s7
; GFX10-CU-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen slc
; GFX10-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX10-CU-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_load_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT:    s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT:    s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s5, 2
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s5, v0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v0, s[4:5], s4, v0
; SKIP-CACHE-INV-NEXT:    buffer_load_dword v0, v0, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 2
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
; GFX90A-NOTTGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s6
; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 2
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s6, v2
; GFX90A-TGSPLIT-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s2, 2
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
; GFX940-NOTTGSPLIT-NEXT:    scratch_load_dword v1, v1, off nt
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-TGSPLIT-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 0x3ff
; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v1, v1, s2
; GFX940-TGSPLIT-NEXT:    s_mov_b32 s2, 2
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v2, s3
; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v1, s2, v2
; GFX940-TGSPLIT-NEXT:    scratch_load_dword v1, v1, off nt
; GFX940-TGSPLIT-NEXT:    s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_load_1:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-WGP-NEXT:    s_load_b32 s3, s[4:5], 0x0
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT:    s_mov_b32 s2, 0x3ff
; GFX11-WGP-NEXT:    v_and_b32_e64 v1, v1, s2
; GFX11-WGP-NEXT:    s_mov_b32 s2, 2
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
; GFX11-WGP-NEXT:    scratch_load_b32 v1, v1, off slc dlc
; GFX11-WGP-NEXT:    s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_load_1:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    v_mov_b32_e32 v1, v0
; GFX11-CU-NEXT:    s_load_b32 s3, s[4:5], 0x0
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT:    s_mov_b32 s2, 0x3ff
; GFX11-CU-NEXT:    v_and_b32_e64 v1, v1, s2
; GFX11-CU-NEXT:    s_mov_b32 s2, 2
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v1, s2, s3
; GFX11-CU-NEXT:    scratch_load_b32 v1, v1, off slc dlc
; GFX11-CU-NEXT:    s_waitcnt vmcnt(0)
; GFX11-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_load_1:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, v0
; GFX12-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT:    s_mov_b32 s3, 0x3ff
; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
; GFX12-WGP-NEXT:    v_and_b32_e64 v1, v1, s3
; GFX12-WGP-NEXT:    s_mov_b32 s3, 2
; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
; GFX12-WGP-NEXT:    s_wait_loadcnt 0x0
; GFX12-WGP-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_load_1:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    v_mov_b32_e32 v1, v0
; GFX12-CU-NEXT:    s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT:    s_mov_b32 s3, 0x3ff
; GFX12-CU-NEXT:    s_wait_alu 0xfffe
; GFX12-CU-NEXT:    v_and_b32_e64 v1, v1, s3
; GFX12-CU-NEXT:    s_mov_b32 s3, 2
; GFX12-CU-NEXT:    s_wait_alu 0xfffe
; GFX12-CU-NEXT:    v_lshlrev_b32_e64 v1, s3, v1
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    scratch_load_b32 v1, v1, s2 th:TH_LOAD_NT
; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
; GFX12-CU-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(5) %in, ptr addrspace(1) %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid
  %val = load i32, ptr addrspace(5) %val.gep, align 4, !nontemporal !0
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Ordinary i32 load from %in (addrspace(1)) followed by a nontemporal store
; through the addrspace(5) kernel argument %out.
define amdgpu_kernel void @private_nontemporal_store_0(
; GFX6-LABEL: private_nontemporal_store_0:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_add_u32 s0, s0, s15
; GFX6-NEXT:    s_addc_u32 s1, s1, 0
; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX6-NEXT:    s_load_dword s4, s[8:9], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s5
; GFX6-NEXT:    v_mov_b32_e32 v1, s4
; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_add_u32 s0, s0, s15
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT:    s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT:    s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s5
; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT:    s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s5
; GFX10-CU-NEXT:    v_mov_b32_e32 v1, s4
; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s1
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v1, s0
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s5
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s4
; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword off, v0, s0 sc0 nt sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    s_load_dword s1, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s1
; GFX940-TGSPLIT-NEXT:    scratch_store_dword off, v0, s0 sc0 nt sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_store_0:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX11-WGP-NEXT:    scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_0:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX11-CU-NEXT:    scratch_store_b32 off, v0, s0 glc slc dlc
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_store_0:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_store_0:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s1
; GFX12-CU-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT
; GFX12-CU-NEXT:    s_endpgm
    ptr addrspace(1) %in, ptr addrspace(5) %out) {
entry:
  %val = load i32, ptr addrspace(1) %in, align 4
  store i32 %val, ptr addrspace(5) %out, !nontemporal !0
  ret void
}

define amdgpu_kernel void @private_nontemporal_store_1(
; GFX6-LABEL: private_nontemporal_store_1:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_add_u32 s0, s0, s15
; GFX6-NEXT:    s_addc_u32 s1, s1, 0
; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX6-NEXT:    s_load_dword s5, s[8:9], 0x2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX6-NEXT:    s_mov_b32 s6, 2
; GFX6-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
; GFX6-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX6-NEXT:    s_endpgm
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_add_u32 s0, s0, s15
; GFX7-NEXT:    s_addc_u32 s1, s1, 0
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT:    s_load_dword s5, s[8:9], 0x2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX7-NEXT:    s_mov_b32 s6, 2
; GFX7-NEXT:    v_lshlrev_b32_e64 v0, s6, v0
; GFX7-NEXT:    v_add_i32_e64 v1, s[6:7], s5, v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX7-NEXT:    s_endpgm
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP:       ; %bb.0: ; %entry
; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT:    s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-WGP-NEXT:    s_mov_b32 s5, 2
; GFX10-WGP-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
; GFX10-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-WGP-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX10-WGP-NEXT:    s_endpgm
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU:       ; %bb.0: ; %entry
; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT:    s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX10-CU-NEXT:    s_mov_b32 s5, 2
; GFX10-CU-NEXT:    v_lshl_add_u32 v1, v0, s5, s6
; GFX10-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-CU-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX10-CU-NEXT:    s_endpgm
;
; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
; SKIP-CACHE-INV:       ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT:    s_getpc_b64 s[12:13]
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s12, s0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx4 s[12:15], s[12:13], 0x0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_add_u32 s12, s12, s11
; SKIP-CACHE-INV-NEXT:    s_addc_u32 s13, s13, 0
; SKIP-CACHE-INV-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT:    s_load_dword s1, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    s_load_dword s0, s[2:3], 0x0
; SKIP-CACHE-INV-NEXT:    s_mov_b32 s2, 2
; SKIP-CACHE-INV-NEXT:    v_lshlrev_b32_e64 v0, s2, v0
; SKIP-CACHE-INV-NEXT:    v_add_i32_e64 v1, s[2:3], s1, v0
; SKIP-CACHE-INV-NEXT:    s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT:    v_mov_b32_e32 v0, s0
; SKIP-CACHE-INV-NEXT:    buffer_store_dword v0, v1, s[12:15], 0 offen glc slc
; SKIP-CACHE-INV-NEXT:    s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
; GFX90A-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s5, 2
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s5, v1
; GFX90A-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
; GFX90A-NOTTGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX90A-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 0x3ff
; GFX90A-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s5
; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s5, 2
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s5, v1
; GFX90A-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s4
; GFX90A-TGSPLIT-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT:    s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX940-NOTTGSPLIT:       ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
; GFX940-NOTTGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
; GFX940-NOTTGSPLIT-NEXT:    s_mov_b32 s1, 2
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-NOTTGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s1, v1
; GFX940-NOTTGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT:    scratch_store_dword v1, v0, off sc0 nt sc1
; GFX940-NOTTGSPLIT-NEXT:    s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX940-TGSPLIT:       ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX940-TGSPLIT-NEXT:    s_load_dword s2, s[4:5], 0x8
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 0x3ff
; GFX940-TGSPLIT-NEXT:    v_and_b32_e64 v0, v0, s1
; GFX940-TGSPLIT-NEXT:    s_mov_b32 s1, 2
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v1, s2
; GFX940-TGSPLIT-NEXT:    v_lshl_add_u32 v1, v0, s1, v1
; GFX940-TGSPLIT-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT:    v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT:    scratch_store_dword v1, v0, off sc0 nt sc1
; GFX940-TGSPLIT-NEXT:    s_endpgm
;
; GFX11-WGP-LABEL: private_nontemporal_store_1:
; GFX11-WGP:       ; %bb.0: ; %entry
; GFX11-WGP-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-WGP-NEXT:    s_mov_b32 s1, 0x3ff
; GFX11-WGP-NEXT:    v_and_b32_e64 v0, v0, s1
; GFX11-WGP-NEXT:    s_mov_b32 s1, 2
; GFX11-WGP-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
; GFX11-WGP-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-WGP-NEXT:    scratch_store_b32 v1, v0, off glc slc dlc
; GFX11-WGP-NEXT:    s_endpgm
;
; GFX11-CU-LABEL: private_nontemporal_store_1:
; GFX11-CU:       ; %bb.0: ; %entry
; GFX11-CU-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-CU-NEXT:    s_mov_b32 s1, 0x3ff
; GFX11-CU-NEXT:    v_and_b32_e64 v0, v0, s1
; GFX11-CU-NEXT:    s_mov_b32 s1, 2
; GFX11-CU-NEXT:    v_lshl_add_u32 v1, v0, s1, s2
; GFX11-CU-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-CU-NEXT:    scratch_store_b32 v1, v0, off glc slc dlc
; GFX11-CU-NEXT:    s_endpgm
;
; GFX12-WGP-LABEL: private_nontemporal_store_1:
; GFX12-WGP:       ; %bb.0: ; %entry
; GFX12-WGP-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX12-WGP-NEXT:    s_mov_b32 s2, 0x3ff
; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
; GFX12-WGP-NEXT:    v_and_b32_e64 v0, v0, s2
; GFX12-WGP-NEXT:    s_mov_b32 s2, 2
; GFX12-WGP-NEXT:    s_wait_alu 0xfffe
; GFX12-WGP-NEXT:    v_lshlrev_b32_e64 v1, s2, v0
; GFX12-WGP-NEXT:    s_wait_kmcnt 0x0
; GFX12-WGP-NEXT:    v_mov_b32_e32 v0, s1
; GFX12-WGP-NEXT:    scratch_store_b32 v1, v0, s0 th:TH_STORE_NT
; GFX12-WGP-NEXT:    s_endpgm
;
; GFX12-CU-LABEL: private_nontemporal_store_1:
; GFX12-CU:       ; %bb.0: ; %entry
; GFX12-CU-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT:    s_load_b32 s0, s[4:5], 0x8
; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
; GFX12-CU-NEXT:    s_load_b32 s1, s[2:3], 0x0
; GFX12-CU-NEXT:    s_mov_b32 s2, 0x3ff
; GFX12-CU-NEXT:    s_wait_alu 0xfffe
; NOTE(review): every check line in this span is autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of this file);
; regenerate with that script instead of editing the expected output by hand.
; The span holds the closing checks and IR body of @private_nontemporal_store_1,
; followed by all of @private_nontemporal_volatile_load. The latter kernel does
; a volatile i32 load tagged !nontemporal from a private (addrspace 5) pointer
; and writes the loaded value to a global (addrspace 1) pointer with an ordinary
; store; the per-target checks pin the instructions llc emits for that pair.
839; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2 840; GFX12-CU-NEXT: s_mov_b32 s2, 2 841; GFX12-CU-NEXT: s_wait_alu 0xfffe 842; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s2, v0 843; GFX12-CU-NEXT: s_wait_kmcnt 0x0 844; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 845; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT 846; GFX12-CU-NEXT: s_endpgm 847 ptr addrspace(1) %in, ptr addrspace(5) %out) { 848entry: 849 %tid = call i32 @llvm.amdgcn.workitem.id.x() 850 %val = load i32, ptr addrspace(1) %in, align 4 851 %out.gep = getelementptr inbounds i32, ptr addrspace(5) %out, i32 %tid 852 store i32 %val, ptr addrspace(5) %out.gep, !nontemporal !0 853 ret void 854} 855 856define amdgpu_kernel void @private_nontemporal_volatile_load( 857; GFX6-LABEL: private_nontemporal_volatile_load: 858; GFX6: ; %bb.0: ; %entry 859; GFX6-NEXT: s_add_u32 s0, s0, s15 860; GFX6-NEXT: s_addc_u32 s1, s1, 0 861; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] 862; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 863; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 864; GFX6-NEXT: s_waitcnt lgkmcnt(0) 865; GFX6-NEXT: s_mov_b32 s11, s5 866; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 867; GFX6-NEXT: s_mov_b32 s9, 0x100f000 868; GFX6-NEXT: s_mov_b32 s10, -1 869; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 870; GFX6-NEXT: s_mov_b32 s5, s11 871; GFX6-NEXT: s_mov_b32 s6, s10 872; GFX6-NEXT: s_mov_b32 s7, s9 873; GFX6-NEXT: v_mov_b32_e32 v0, s8 874; GFX6-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc 875; GFX6-NEXT: s_waitcnt vmcnt(0) 876; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 877; GFX6-NEXT: s_endpgm 878; 879; GFX7-LABEL: private_nontemporal_volatile_load: 880; GFX7: ; %bb.0: ; %entry 881; GFX7-NEXT: s_add_u32 s0, s0, s15 882; GFX7-NEXT: s_addc_u32 s1, s1, 0 883; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 884; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 885; GFX7-NEXT: s_waitcnt lgkmcnt(0) 886; GFX7-NEXT: v_mov_b32_e32 v0, s6 887; GFX7-NEXT: buffer_load_dword v2, v0, 
s[0:3], 0 offen glc 888; GFX7-NEXT: s_waitcnt vmcnt(0) 889; GFX7-NEXT: v_mov_b32_e32 v0, s4 890; GFX7-NEXT: v_mov_b32_e32 v1, s5 891; GFX7-NEXT: flat_store_dword v[0:1], v2 892; GFX7-NEXT: s_endpgm 893; 894; GFX10-WGP-LABEL: private_nontemporal_volatile_load: 895; GFX10-WGP: ; %bb.0: ; %entry 896; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15 897; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 898; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 899; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 900; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 901; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 902; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 903; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc 904; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 905; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] 906; GFX10-WGP-NEXT: s_endpgm 907; 908; GFX10-CU-LABEL: private_nontemporal_volatile_load: 909; GFX10-CU: ; %bb.0: ; %entry 910; GFX10-CU-NEXT: s_add_u32 s0, s0, s15 911; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 912; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 913; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 914; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 915; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 916; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 917; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc 918; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 919; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] 920; GFX10-CU-NEXT: s_endpgm 921; 922; SKIP-CACHE-INV-LABEL: private_nontemporal_volatile_load: 923; SKIP-CACHE-INV: ; %bb.0: ; %entry 924; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[12:13] 925; SKIP-CACHE-INV-NEXT: s_mov_b32 s12, s0 926; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0 927; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 928; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11 929; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0 930; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 931; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 932; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 933; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 934; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 935; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 936; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 937; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 938; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 939; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 940; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 941; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 942; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 943; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v0, s[12:15], 0 offen glc 944; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 945; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 946; SKIP-CACHE-INV-NEXT: s_endpgm 947; 948; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: 949; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 950; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15 951; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0 952; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 953; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 954; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 955; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 956; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 957; GFX90A-NOTTGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc 958; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 959; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 960; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 961; 962; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load: 963; GFX90A-TGSPLIT: ; %bb.0: ; %entry 964; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15 965; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0 966; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 967; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 968; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 969; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 970; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 971; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc 972; GFX90A-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) 973; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 974; GFX90A-TGSPLIT-NEXT: s_endpgm 975; 976; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load: 977; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 978; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 979; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 980; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 981; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 982; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1 983; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 984; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 985; GFX940-NOTTGSPLIT-NEXT: s_endpgm 986; 987; GFX940-TGSPLIT-LABEL: private_nontemporal_volatile_load: 988; GFX940-TGSPLIT: ; %bb.0: ; %entry 989; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 990; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 991; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 992; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 993; GFX940-TGSPLIT-NEXT: scratch_load_dword v1, off, s2 sc0 sc1 994; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 995; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 996; GFX940-TGSPLIT-NEXT: s_endpgm 997; 998; GFX11-WGP-LABEL: private_nontemporal_volatile_load: 999; GFX11-WGP: ; %bb.0: ; %entry 1000; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 1001; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 1002; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 1003; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 1004; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc 1005; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 1006; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] 1007; GFX11-WGP-NEXT: s_endpgm 1008; 1009; GFX11-CU-LABEL: private_nontemporal_volatile_load: 1010; GFX11-CU: ; %bb.0: ; %entry 1011; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 1012; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 1013; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 1014; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 1015; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 
glc dlc 1016; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 1017; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] 1018; GFX11-CU-NEXT: s_endpgm 1019; 1020; GFX12-WGP-LABEL: private_nontemporal_volatile_load: 1021; GFX12-WGP: ; %bb.0: ; %entry 1022; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 1023; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 1024; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 1025; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 1026; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS 1027; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 1028; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 1029; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 1030; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] 1031; GFX12-WGP-NEXT: s_endpgm 1032; 1033; GFX12-CU-LABEL: private_nontemporal_volatile_load: 1034; GFX12-CU: ; %bb.0: ; %entry 1035; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 1036; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 1037; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 1038; GFX12-CU-NEXT: s_wait_kmcnt 0x0 1039; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS 1040; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 1041; GFX12-CU-NEXT: s_wait_samplecnt 0x0 1042; GFX12-CU-NEXT: s_wait_loadcnt 0x0 1043; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] 1044; GFX12-CU-NEXT: s_endpgm 1045 ptr addrspace(5) %in, ptr addrspace(1) %out) { 1046entry: 1047 %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0 1048 store i32 %val, ptr addrspace(1) %out 1049 ret void 1050} 1051 1052!0 = !{i32 1} 1053declare i32 @llvm.amdgcn.workitem.id.x() 1054