1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s 3; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s 4; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s 5; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s 7; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s 8; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s 9; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s 10; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s 11; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s 12; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s 13; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s 14; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s 15 16define amdgpu_kernel void @local_nontemporal_load_0( 17; GFX6-LABEL: local_nontemporal_load_0: 18; GFX6: ; %bb.0: ; %entry 19; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] 20; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 21; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 22; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 23; GFX6-NEXT: s_waitcnt lgkmcnt(0) 24; GFX6-NEXT: s_mov_b32 s11, s5 25; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 
26; GFX6-NEXT: s_mov_b32 s9, 0x100f000 27; GFX6-NEXT: s_mov_b32 s10, -1 28; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 29; GFX6-NEXT: s_mov_b32 s5, s11 30; GFX6-NEXT: s_mov_b32 s6, s10 31; GFX6-NEXT: s_mov_b32 s7, s9 32; GFX6-NEXT: s_mov_b32 m0, -1 33; GFX6-NEXT: v_mov_b32_e32 v0, s8 34; GFX6-NEXT: ds_read_b32 v0, v0 35; GFX6-NEXT: s_waitcnt lgkmcnt(0) 36; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 37; GFX6-NEXT: s_endpgm 38; 39; GFX7-LABEL: local_nontemporal_load_0: 40; GFX7: ; %bb.0: ; %entry 41; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 42; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 43; GFX7-NEXT: s_mov_b32 m0, -1 44; GFX7-NEXT: s_waitcnt lgkmcnt(0) 45; GFX7-NEXT: v_mov_b32_e32 v0, s6 46; GFX7-NEXT: ds_read_b32 v2, v0 47; GFX7-NEXT: v_mov_b32_e32 v0, s4 48; GFX7-NEXT: v_mov_b32_e32 v1, s5 49; GFX7-NEXT: s_waitcnt lgkmcnt(0) 50; GFX7-NEXT: flat_store_dword v[0:1], v2 51; GFX7-NEXT: s_endpgm 52; 53; GFX10-WGP-LABEL: local_nontemporal_load_0: 54; GFX10-WGP: ; %bb.0: ; %entry 55; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0 56; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 57; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 58; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 59; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6 60; GFX10-WGP-NEXT: ds_read_b32 v1, v1 61; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 62; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] 63; GFX10-WGP-NEXT: s_endpgm 64; 65; GFX10-CU-LABEL: local_nontemporal_load_0: 66; GFX10-CU: ; %bb.0: ; %entry 67; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0 68; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 69; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 70; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 71; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6 72; GFX10-CU-NEXT: ds_read_b32 v1, v1 73; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 74; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] 75; GFX10-CU-NEXT: s_endpgm 76; 77; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0: 78; SKIP-CACHE-INV: ; %bb.0: ; %entry 79; SKIP-CACHE-INV-NEXT: 
s_mov_b64 s[0:1], s[4:5] 80; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 81; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 82; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 83; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 84; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 85; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 86; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 87; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 88; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 89; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 90; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 91; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 92; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 93; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 94; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 95; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 96; SKIP-CACHE-INV-NEXT: s_endpgm 97; 98; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0: 99; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 100; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 101; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 102; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 103; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 104; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 105; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 106; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 107; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 108; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 109; 110; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0: 111; GFX90A-TGSPLIT: ; %bb.0: ; %entry 112; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0 113; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 114; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 115; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 116; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 117; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 118; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 119; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 120; GFX90A-TGSPLIT-NEXT: s_endpgm 121; 
122; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0: 123; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 124; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 125; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 126; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 127; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 128; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 129; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 130; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 131; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 132; GFX940-NOTTGSPLIT-NEXT: s_endpgm 133; 134; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0: 135; GFX940-TGSPLIT: ; %bb.0: ; %entry 136; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0 137; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 138; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 139; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 140; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 141; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v1 142; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 143; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 144; GFX940-TGSPLIT-NEXT: s_endpgm 145; 146; GFX11-WGP-LABEL: local_nontemporal_load_0: 147; GFX11-WGP: ; %bb.0: ; %entry 148; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 149; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 150; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 151; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 152; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 153; GFX11-WGP-NEXT: ds_load_b32 v1, v1 154; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 155; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] 156; GFX11-WGP-NEXT: s_endpgm 157; 158; GFX11-CU-LABEL: local_nontemporal_load_0: 159; GFX11-CU: ; %bb.0: ; %entry 160; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 161; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 162; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 163; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 164; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 165; GFX11-CU-NEXT: ds_load_b32 v1, v1 166; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 167; 
GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] 168; GFX11-CU-NEXT: s_endpgm 169; 170; GFX12-WGP-LABEL: local_nontemporal_load_0: 171; GFX12-WGP: ; %bb.0: ; %entry 172; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0 173; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 174; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 175; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 176; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2 177; GFX12-WGP-NEXT: ds_load_b32 v1, v1 178; GFX12-WGP-NEXT: s_wait_dscnt 0x0 179; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] 180; GFX12-WGP-NEXT: s_endpgm 181; 182; GFX12-CU-LABEL: local_nontemporal_load_0: 183; GFX12-CU: ; %bb.0: ; %entry 184; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 185; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 186; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 187; GFX12-CU-NEXT: s_wait_kmcnt 0x0 188; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 189; GFX12-CU-NEXT: ds_load_b32 v1, v1 190; GFX12-CU-NEXT: s_wait_dscnt 0x0 191; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] 192; GFX12-CU-NEXT: s_endpgm 193 ptr addrspace(3) %in, ptr addrspace(1) %out) { 194entry: 195 %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0 196 store i32 %val, ptr addrspace(1) %out 197 ret void 198} 199 200define amdgpu_kernel void @local_nontemporal_load_1( 201; GFX6-LABEL: local_nontemporal_load_1: 202; GFX6: ; %bb.0: ; %entry 203; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9] 204; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 205; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8 206; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 207; GFX6-NEXT: s_waitcnt lgkmcnt(0) 208; GFX6-NEXT: s_mov_b32 s11, s5 209; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5 210; GFX6-NEXT: s_mov_b32 s9, 0x100f000 211; GFX6-NEXT: s_mov_b32 s10, -1 212; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7 213; GFX6-NEXT: s_mov_b32 s5, s11 214; GFX6-NEXT: s_mov_b32 s6, s10 215; GFX6-NEXT: s_mov_b32 s7, s9 216; GFX6-NEXT: s_mov_b32 s9, 2 217; GFX6-NEXT: v_lshlrev_b32_e64 v0, s9, v0 218; GFX6-NEXT: 
v_add_i32_e64 v0, s[8:9], s8, v0 219; GFX6-NEXT: s_mov_b32 m0, -1 220; GFX6-NEXT: ds_read_b32 v0, v0 221; GFX6-NEXT: s_waitcnt lgkmcnt(0) 222; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 223; GFX6-NEXT: s_endpgm 224; 225; GFX7-LABEL: local_nontemporal_load_1: 226; GFX7: ; %bb.0: ; %entry 227; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0 228; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2 229; GFX7-NEXT: s_mov_b32 s7, 2 230; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0 231; GFX7-NEXT: s_waitcnt lgkmcnt(0) 232; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0 233; GFX7-NEXT: s_mov_b32 m0, -1 234; GFX7-NEXT: ds_read_b32 v2, v0 235; GFX7-NEXT: v_mov_b32_e32 v0, s4 236; GFX7-NEXT: v_mov_b32_e32 v1, s5 237; GFX7-NEXT: s_waitcnt lgkmcnt(0) 238; GFX7-NEXT: flat_store_dword v[0:1], v2 239; GFX7-NEXT: s_endpgm 240; 241; GFX10-WGP-LABEL: local_nontemporal_load_1: 242; GFX10-WGP: ; %bb.0: ; %entry 243; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0 244; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0 245; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 246; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0 247; GFX10-WGP-NEXT: s_mov_b32 s6, 2 248; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 249; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, s6, s7 250; GFX10-WGP-NEXT: ds_read_b32 v1, v1 251; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 252; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5] 253; GFX10-WGP-NEXT: s_endpgm 254; 255; GFX10-CU-LABEL: local_nontemporal_load_1: 256; GFX10-CU: ; %bb.0: ; %entry 257; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0 258; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0 259; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 260; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0 261; GFX10-CU-NEXT: s_mov_b32 s6, 2 262; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 263; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, s6, s7 264; GFX10-CU-NEXT: ds_read_b32 v1, v1 265; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 266; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5] 267; GFX10-CU-NEXT: s_endpgm 268; 269; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1: 270; 
SKIP-CACHE-INV: ; %bb.0: ; %entry 271; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5] 272; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0 273; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 274; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 275; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1 276; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1 277; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000 278; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 279; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3 280; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7 281; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6 282; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5 283; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 2 284; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s5, v0 285; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[4:5], s4, v0 286; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 287; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 288; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 289; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 290; SKIP-CACHE-INV-NEXT: s_endpgm 291; 292; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1: 293; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 294; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 295; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 296; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 297; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 298; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 0x3ff 299; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 300; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s6, 2 301; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 302; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7 303; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 304; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 305; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 306; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 307; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 308; 309; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1: 310; GFX90A-TGSPLIT: ; %bb.0: ; %entry 311; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 312; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0 313; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8 314; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 315; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 0x3ff 316; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s6 317; GFX90A-TGSPLIT-NEXT: s_mov_b32 s6, 2 318; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 319; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7 320; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s6, v2 321; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 322; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 323; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5] 324; GFX90A-TGSPLIT-NEXT: s_endpgm 325; 326; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1: 327; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 328; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0 329; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 330; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 331; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0 332; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 0x3ff 333; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 334; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s2, 2 335; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 336; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3 337; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 338; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1 339; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 340; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 341; GFX940-NOTTGSPLIT-NEXT: s_endpgm 342; 343; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1: 344; GFX940-TGSPLIT: ; %bb.0: ; %entry 345; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0 346; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x0 347; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 348; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 349; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 0x3ff 350; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s2 351; GFX940-TGSPLIT-NEXT: s_mov_b32 s2, 2 352; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 353; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3 354; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, s2, v2 355; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v1 356; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 357; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 358; GFX940-TGSPLIT-NEXT: s_endpgm 359; 360; GFX11-WGP-LABEL: local_nontemporal_load_1: 361; GFX11-WGP: ; %bb.0: ; %entry 362; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0 363; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 364; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 365; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 366; GFX11-WGP-NEXT: s_mov_b32 s2, 0x3ff 367; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s2 368; GFX11-WGP-NEXT: s_mov_b32 s2, 2 369; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 370; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 371; GFX11-WGP-NEXT: ds_load_b32 v1, v1 372; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 373; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] 374; GFX11-WGP-NEXT: s_endpgm 375; 376; GFX11-CU-LABEL: local_nontemporal_load_1: 377; GFX11-CU: ; %bb.0: ; %entry 378; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0 379; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 380; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 381; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 382; GFX11-CU-NEXT: s_mov_b32 s2, 0x3ff 383; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s2 384; GFX11-CU-NEXT: s_mov_b32 s2, 2 385; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 386; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 387; GFX11-CU-NEXT: ds_load_b32 v1, v1 388; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 389; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] 390; GFX11-CU-NEXT: s_endpgm 391; 392; GFX12-WGP-LABEL: local_nontemporal_load_1: 393; GFX12-WGP: ; %bb.0: ; %entry 394; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0 395; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x0 396; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 397; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0 398; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff 399; GFX12-WGP-NEXT: s_wait_alu 0xfffe 400; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s2 401; 
GFX12-WGP-NEXT: s_mov_b32 s2, 2 402; GFX12-WGP-NEXT: s_wait_alu 0xfffe 403; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 404; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, s2, s3 405; GFX12-WGP-NEXT: ds_load_b32 v1, v1 406; GFX12-WGP-NEXT: s_wait_dscnt 0x0 407; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] 408; GFX12-WGP-NEXT: s_endpgm 409; 410; GFX12-CU-LABEL: local_nontemporal_load_1: 411; GFX12-CU: ; %bb.0: ; %entry 412; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0 413; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x0 414; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 415; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0 416; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff 417; GFX12-CU-NEXT: s_wait_alu 0xfffe 418; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s2 419; GFX12-CU-NEXT: s_mov_b32 s2, 2 420; GFX12-CU-NEXT: s_wait_alu 0xfffe 421; GFX12-CU-NEXT: s_wait_kmcnt 0x0 422; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, s2, s3 423; GFX12-CU-NEXT: ds_load_b32 v1, v1 424; GFX12-CU-NEXT: s_wait_dscnt 0x0 425; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] 426; GFX12-CU-NEXT: s_endpgm 427 ptr addrspace(3) %in, ptr addrspace(1) %out) { 428entry: 429 %tid = call i32 @llvm.amdgcn.workitem.id.x() 430 %val.gep = getelementptr inbounds i32, ptr addrspace(3) %in, i32 %tid 431 %val = load i32, ptr addrspace(3) %val.gep, align 4, !nontemporal !0 432 store i32 %val, ptr addrspace(1) %out 433 ret void 434} 435 436define amdgpu_kernel void @local_nontemporal_store_0( 437; GFX6-LABEL: local_nontemporal_store_0: 438; GFX6: ; %bb.0: ; %entry 439; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 440; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 441; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 442; GFX6-NEXT: s_waitcnt lgkmcnt(0) 443; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 444; GFX6-NEXT: s_mov_b32 m0, -1 445; GFX6-NEXT: v_mov_b32_e32 v0, s5 446; GFX6-NEXT: s_waitcnt lgkmcnt(0) 447; GFX6-NEXT: v_mov_b32_e32 v1, s4 448; GFX6-NEXT: ds_write_b32 v0, v1 449; GFX6-NEXT: s_endpgm 450; 451; GFX7-LABEL: local_nontemporal_store_0: 452; GFX7: ; %bb.0: ; %entry 453; 
GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 454; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 455; GFX7-NEXT: s_waitcnt lgkmcnt(0) 456; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 457; GFX7-NEXT: s_mov_b32 m0, -1 458; GFX7-NEXT: v_mov_b32_e32 v0, s5 459; GFX7-NEXT: s_waitcnt lgkmcnt(0) 460; GFX7-NEXT: v_mov_b32_e32 v1, s4 461; GFX7-NEXT: ds_write_b32 v0, v1 462; GFX7-NEXT: s_endpgm 463; 464; GFX10-WGP-LABEL: local_nontemporal_store_0: 465; GFX10-WGP: ; %bb.0: ; %entry 466; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 467; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8 468; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 469; GFX10-WGP-NEXT: s_load_dword s4, s[6:7], 0x0 470; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5 471; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 472; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 473; GFX10-WGP-NEXT: ds_write_b32 v0, v1 474; GFX10-WGP-NEXT: s_endpgm 475; 476; GFX10-CU-LABEL: local_nontemporal_store_0: 477; GFX10-CU: ; %bb.0: ; %entry 478; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 479; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8 480; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 481; GFX10-CU-NEXT: s_load_dword s4, s[6:7], 0x0 482; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5 483; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 484; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 485; GFX10-CU-NEXT: ds_write_b32 v0, v1 486; GFX10-CU-NEXT: s_endpgm 487; 488; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0: 489; SKIP-CACHE-INV: ; %bb.0: ; %entry 490; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 491; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 492; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 493; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 494; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 495; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 496; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 497; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 498; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 499; SKIP-CACHE-INV-NEXT: s_endpgm 500; 501; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0: 502; GFX90A-NOTTGSPLIT: 
; %bb.0: ; %entry 503; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 504; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 505; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 506; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 507; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s5 508; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 509; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 510; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 511; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 512; 513; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0: 514; GFX90A-TGSPLIT: ; %bb.0: ; %entry 515; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 516; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8 517; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 518; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[6:7], 0x0 519; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s5 520; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 521; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 522; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 523; GFX90A-TGSPLIT-NEXT: s_endpgm 524; 525; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0: 526; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 527; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 528; GFX940-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 529; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 530; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 531; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s1 532; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 533; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 534; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 535; GFX940-NOTTGSPLIT-NEXT: s_endpgm 536; 537; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0: 538; GFX940-TGSPLIT: ; %bb.0: ; %entry 539; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 540; GFX940-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8 541; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 542; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0 543; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s1 544; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) 545; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 546; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 547; GFX940-TGSPLIT-NEXT: s_endpgm 548; 549; GFX11-WGP-LABEL: local_nontemporal_store_0: 550; GFX11-WGP: ; %bb.0: ; %entry 551; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 552; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 553; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 554; GFX11-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 555; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 556; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 557; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 558; GFX11-WGP-NEXT: ds_store_b32 v0, v1 559; GFX11-WGP-NEXT: s_endpgm 560; 561; GFX11-CU-LABEL: local_nontemporal_store_0: 562; GFX11-CU: ; %bb.0: ; %entry 563; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 564; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 565; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 566; GFX11-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 567; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 568; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 569; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 570; GFX11-CU-NEXT: ds_store_b32 v0, v1 571; GFX11-CU-NEXT: s_endpgm 572; 573; GFX12-WGP-LABEL: local_nontemporal_store_0: 574; GFX12-WGP: ; %bb.0: ; %entry 575; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 576; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8 577; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 578; GFX12-WGP-NEXT: s_load_b32 s0, s[2:3], 0x0 579; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 580; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 581; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 582; GFX12-WGP-NEXT: ds_store_b32 v0, v1 583; GFX12-WGP-NEXT: s_endpgm 584; 585; GFX12-CU-LABEL: local_nontemporal_store_0: 586; GFX12-CU: ; %bb.0: ; %entry 587; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 588; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 589; GFX12-CU-NEXT: s_wait_kmcnt 0x0 590; GFX12-CU-NEXT: s_load_b32 s0, s[2:3], 0x0 591; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 592; GFX12-CU-NEXT: s_wait_kmcnt 0x0 593; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 594; GFX12-CU-NEXT: ds_store_b32 v0, v1 595; GFX12-CU-NEXT: 
s_endpgm 596 ptr addrspace(1) %in, ptr addrspace(3) %out) { 597entry: 598 %val = load i32, ptr addrspace(1) %in, align 4 599 store i32 %val, ptr addrspace(3) %out, !nontemporal !0 600 ret void 601} 602 603define amdgpu_kernel void @local_nontemporal_store_1( 604; GFX6-LABEL: local_nontemporal_store_1: 605; GFX6: ; %bb.0: ; %entry 606; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2 607; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5 608; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 609; GFX6-NEXT: s_waitcnt lgkmcnt(0) 610; GFX6-NEXT: s_load_dword s4, s[6:7], 0x0 611; GFX6-NEXT: s_mov_b32 s6, 2 612; GFX6-NEXT: v_lshlrev_b32_e64 v0, s6, v0 613; GFX6-NEXT: v_add_i32_e64 v0, s[6:7], s5, v0 614; GFX6-NEXT: s_mov_b32 m0, -1 615; GFX6-NEXT: s_waitcnt lgkmcnt(0) 616; GFX6-NEXT: v_mov_b32_e32 v1, s4 617; GFX6-NEXT: ds_write_b32 v0, v1 618; GFX6-NEXT: s_endpgm 619; 620; GFX7-LABEL: local_nontemporal_store_1: 621; GFX7: ; %bb.0: ; %entry 622; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 623; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2 624; GFX7-NEXT: s_waitcnt lgkmcnt(0) 625; GFX7-NEXT: s_load_dword s4, s[6:7], 0x0 626; GFX7-NEXT: s_mov_b32 s6, 2 627; GFX7-NEXT: v_lshlrev_b32_e64 v0, s6, v0 628; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s5, v0 629; GFX7-NEXT: s_mov_b32 m0, -1 630; GFX7-NEXT: s_waitcnt lgkmcnt(0) 631; GFX7-NEXT: v_mov_b32_e32 v1, s4 632; GFX7-NEXT: ds_write_b32 v0, v1 633; GFX7-NEXT: s_endpgm 634; 635; GFX10-WGP-LABEL: local_nontemporal_store_1: 636; GFX10-WGP: ; %bb.0: ; %entry 637; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 638; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8 639; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 640; GFX10-WGP-NEXT: s_load_dword s4, s[4:5], 0x0 641; GFX10-WGP-NEXT: s_mov_b32 s5, 2 642; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, s5, s6 643; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 644; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s4 645; GFX10-WGP-NEXT: ds_write_b32 v0, v1 646; GFX10-WGP-NEXT: s_endpgm 647; 648; GFX10-CU-LABEL: local_nontemporal_store_1: 649; GFX10-CU: 
; %bb.0: ; %entry 650; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 651; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8 652; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 653; GFX10-CU-NEXT: s_load_dword s4, s[4:5], 0x0 654; GFX10-CU-NEXT: s_mov_b32 s5, 2 655; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, s5, s6 656; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 657; GFX10-CU-NEXT: v_mov_b32_e32 v1, s4 658; GFX10-CU-NEXT: ds_write_b32 v0, v1 659; GFX10-CU-NEXT: s_endpgm 660; 661; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1: 662; SKIP-CACHE-INV: ; %bb.0: ; %entry 663; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 664; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2 665; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 666; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[2:3], 0x0 667; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, 2 668; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e64 v0, s2, v0 669; SKIP-CACHE-INV-NEXT: v_add_i32_e64 v0, s[2:3], s1, v0 670; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 671; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) 672; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 673; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 674; SKIP-CACHE-INV-NEXT: s_endpgm 675; 676; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1: 677; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry 678; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 679; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 680; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 681; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 682; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 0x3ff 683; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 684; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s5, 2 685; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6 686; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 687; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 688; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s4 689; GFX90A-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 690; GFX90A-NOTTGSPLIT-NEXT: s_endpgm 691; 692; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1: 693; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry 694; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 695; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8 696; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 697; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[4:5], 0x0 698; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 0x3ff 699; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s5 700; GFX90A-TGSPLIT-NEXT: s_mov_b32 s5, 2 701; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6 702; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s5, v1 703; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 704; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4 705; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1 706; GFX90A-TGSPLIT-NEXT: s_endpgm 707; 708; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1: 709; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 710; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 711; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 712; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 713; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 714; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 0x3ff 715; GFX940-NOTTGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 716; GFX940-NOTTGSPLIT-NEXT: s_mov_b32 s1, 2 717; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 718; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 719; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 720; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 721; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1 722; GFX940-NOTTGSPLIT-NEXT: s_endpgm 723; 724; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1: 725; GFX940-TGSPLIT: ; %bb.0: ; %entry 726; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 727; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 728; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 729; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 730; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 0x3ff 731; GFX940-TGSPLIT-NEXT: v_and_b32_e64 v0, v0, s1 732; GFX940-TGSPLIT-NEXT: s_mov_b32 s1, 2 733; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 734; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, s1, v1 735; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 736; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 737; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1 738; GFX940-TGSPLIT-NEXT: s_endpgm 739; 740; GFX11-WGP-LABEL: local_nontemporal_store_1: 741; GFX11-WGP: ; %bb.0: ; %entry 742; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 743; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 744; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 745; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 746; GFX11-WGP-NEXT: s_mov_b32 s1, 0x3ff 747; GFX11-WGP-NEXT: v_and_b32_e64 v0, v0, s1 748; GFX11-WGP-NEXT: s_mov_b32 s1, 2 749; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 750; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 751; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 752; GFX11-WGP-NEXT: ds_store_b32 v0, v1 753; GFX11-WGP-NEXT: s_endpgm 754; 755; GFX11-CU-LABEL: local_nontemporal_store_1: 756; GFX11-CU: ; %bb.0: ; %entry 757; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 758; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8 759; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 760; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 761; GFX11-CU-NEXT: s_mov_b32 s1, 0x3ff 762; GFX11-CU-NEXT: v_and_b32_e64 v0, v0, s1 763; GFX11-CU-NEXT: s_mov_b32 s1, 2 764; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 765; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 766; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 767; GFX11-CU-NEXT: ds_store_b32 v0, v1 768; GFX11-CU-NEXT: s_endpgm 769; 770; GFX12-WGP-LABEL: local_nontemporal_store_1: 771; GFX12-WGP: ; %bb.0: ; %entry 772; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 773; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8 774; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 775; GFX12-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 776; GFX12-WGP-NEXT: s_mov_b32 s1, 0x3ff 777; GFX12-WGP-NEXT: s_wait_alu 0xfffe 778; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s1 779; GFX12-WGP-NEXT: s_mov_b32 s1, 2 780; GFX12-WGP-NEXT: s_wait_alu 0xfffe 781; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 782; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 783; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 784; GFX12-WGP-NEXT: 
ds_store_b32 v0, v1
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: local_nontemporal_store_1:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-CU-NEXT: s_mov_b32 s1, 0x3ff
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s1
; GFX12-CU-NEXT: s_mov_b32 s1, 2
; GFX12-CU-NEXT: s_wait_alu 0xfffe
; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX12-CU-NEXT: ds_store_b32 v0, v1
; GFX12-CU-NEXT: s_endpgm
    ptr addrspace(1) %in, ptr addrspace(3) %out) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %val = load i32, ptr addrspace(1) %in, align 4
  %out.gep = getelementptr inbounds i32, ptr addrspace(3) %out, i32 %tid
  store i32 %val, ptr addrspace(3) %out.gep, !nontemporal !0
  ret void
}

; Kernel under test: a single i32 load from an addrspace(3) pointer that is
; marked both volatile and !nontemporal, whose result is then written with an
; ordinary store through an addrspace(1) pointer.
;
; The autogenerated checks below pin the -O0 output of each RUN configuration.
; In all of them the load appears as a ds_read_b32 / ds_load_b32 with no extra
; modifiers, and a wait (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on the gfx12
; runs) sits between that read and the store of the loaded value.
define amdgpu_kernel void @local_nontemporal_volatile_load(
; GFX6-LABEL: local_nontemporal_volatile_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: s_mov_b32 m0, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: ds_read_b32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: local_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ds_read_b32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: local_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: local_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: ds_read_b32 v1, v1
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_volatile_load:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX940-TGSPLIT-NEXT: ds_read_b32 v1, v1
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: local_nontemporal_volatile_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: local_nontemporal_volatile_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: ds_load_b32 v1, v1
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: local_nontemporal_volatile_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: ds_load_b32 v1, v1
; GFX12-CU-NEXT: s_wait_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
    ptr addrspace(3) %in, ptr addrspace(1) %out) {
entry:
  ; The load is the only operation here that is volatile and carries the
  ; !nontemporal attachment.
  %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0
  ; Plain store of the loaded value: not volatile, no metadata attached.
  store i32 %val, ptr addrspace(1) %out
  ret void
}

; Metadata node referenced by the !nontemporal attachments above.
!0 = !{i32 1}
; Work-item id intrinsic; local_nontemporal_store_1 uses it to index %out.
declare i32 @llvm.amdgcn.workitem.id.x()
