; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

; Every kernel below stores to an addrspace(3) address of the form
; "constant - (workitem id * scale)" or "constant + (workitem id * scale)".
; The recorded assembly shows, per target, whether the constant ends up in the
; immediate offset field of the DS store or stays in the address arithmetic.
;
; The assertions inside each function are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

declare i32 @llvm.amdgcn.workitem.id.x() #0

@lds.obj = addrspace(3) global [256 x i32] undef, align 4

; Stores to @lds.obj indexed by (0 - workitem id) plus three further i32
; elements. All four check sets expect the negated, scaled id as the base
; register and the constant as offset 12 on the DS store.
define amdgpu_kernel void @write_ds_sub0_offset0_global() #1 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:12
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:12
; GFX11-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
  store i32 123, ptr addrspace(3) %arrayidx
  ret void
}

; Same LDS address computation as @write_ds_sub0_offset0_global, followed by a
; llvm.amdgcn.div.fmas.f32 call whose result is stored volatile through a null
; global pointer. The DS store is still expected to carry offset 12.
define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:12
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:12
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_b32 v2, v3 offset:12
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
  store i32 123, ptr addrspace(3) %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

; Indexes @lds.obj by (-1 - workitem id) plus 16383 further i32 elements, then
; performs the same div.fmas / volatile global store as the kernel above. In the
; recorded output the DS store uses a zero base register with no immediate
; offset, and the workitem id register is not read.
define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_div_fmas_f32 v0, v0, v0, v0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v2, v1
; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    ds_write_b32 v4, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v0, v0, v0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v3, v2
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_b32 v3, v2
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 -1, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
  store i32 123, ptr addrspace(3) %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

; Byte store to (workitem id << 4) + 65535. This is an add rather than a
; subtract; every target is expected to keep the shifted id as the base and put
; 65535 in the offset field of the DS store.
define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; CI-LABEL: add_x_shl_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_max_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT:    ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %x.i, 4
  %add = add i32 %shl, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr, align 1
  ret void
}

; Byte store to 65535 + (workitem id * -4). This could use the offset
; transform, but the subtract from 65535 is emitted as an xor with 0xffff, so
; no immediate offset is folded into the DS store.

define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %.neg = mul i32 %x.i, -4
  %add = add i32 %.neg, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr, align 1
  ret void
}

; Same address as the kernel above, written as 65535 + ((0 - workitem id) << 2).
; This could use the offset transform, but the subtract from 65535 is again
; emitted as an xor with 0xffff, so no immediate offset is folded.

define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65535, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr
  ret void
}

; Same shape with the constant raised by one to 65536. The recorded output
; keeps 0x10000 as the minuend of the subtract and emits the DS store without
; an immediate offset.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x10000, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x10000, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65536, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr
  ret void
}

; The negated, shifted workitem id feeds two volatile stores with different
; constants (123 and 456). The checks expect a single subtract from zero shared
; by both DS stores, with the constants as their immediate offsets.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:456
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:456
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add0 = add i32 123, %shl
  %add1 = add i32 456, %shl
  %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr0
  %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr1
  ret void
}

; Two volatile stores through the same pointer, 123 + ((0 - workitem id) << 2).
; Both DS stores are expected to share one base register and carry offset 123.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 123, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr
  store volatile i32 13, ptr addrspace(3) %ptr
  ret void
}

; i64 store with 4-byte alignment to 1019 + ((0 - workitem id) << 2).
; The CI, GFX9 and GFX11 checks keep 0x3fb in the subtract and use a paired
; 32-bit store with offset1 = 1. The GFX10 checks subtract from zero and emit
; two single 32-bit stores at offsets 1019 and 1023.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT:    ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fb, v0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  ret void
}

; Same i64 store as @add_x_shl_neg_to_sub_misaligned_i64_max_offset, followed
; by a llvm.amdgcn.div.fmas.f32 call whose result is stored volatile through a
; null global pointer. The DS store shapes per target match the kernel above.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    v_mov_b32_e32 v3, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:1023
; GFX10-NEXT:    ds_write_b32 v2, v4 offset:1019
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v5, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_2addr_b32 v2, v3, v4 offset1:1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

; Same i64 store with the constant raised by one to 1020.
; The CI, GFX9 and GFX11 checks keep 0x3fc in the subtract and use a paired
; 32-bit store with offset1 = 1. The GFX10 checks subtract from zero, add 0x200
; to the base, and use a paired store with offset0 = 127 and offset1 = 128.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fc, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fc, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT:    ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fc, v0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1020, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  ret void
}

declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

; #0 carries readnone and is used on the workitem-id intrinsic declaration and
; some of its call sites. Every kernel definition uses #1, because the kernels
; store to memory, which a readnone definition would contradict.
; #2 is not referenced anywhere in this file.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind convergent }