1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s 5 6define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 7; GFX6-LABEL: or_v2i32: 8; GFX6: ; %bb.0: 9; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 10; GFX6-NEXT: s_mov_b32 s7, 0xf000 11; GFX6-NEXT: s_mov_b32 s6, -1 12; GFX6-NEXT: s_mov_b32 s10, s6 13; GFX6-NEXT: s_mov_b32 s11, s7 14; GFX6-NEXT: s_waitcnt lgkmcnt(0) 15; GFX6-NEXT: s_mov_b32 s8, s2 16; GFX6-NEXT: s_mov_b32 s9, s3 17; GFX6-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 18; GFX6-NEXT: s_mov_b32 s4, s0 19; GFX6-NEXT: s_mov_b32 s5, s1 20; GFX6-NEXT: s_waitcnt vmcnt(0) 21; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 22; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 23; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 24; GFX6-NEXT: s_endpgm 25; 26; GFX8-LABEL: or_v2i32: 27; GFX8: ; %bb.0: 28; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 29; GFX8-NEXT: s_mov_b32 s7, 0xf000 30; GFX8-NEXT: s_mov_b32 s6, -1 31; GFX8-NEXT: s_mov_b32 s10, s6 32; GFX8-NEXT: s_mov_b32 s11, s7 33; GFX8-NEXT: s_waitcnt lgkmcnt(0) 34; GFX8-NEXT: s_mov_b32 s8, s2 35; GFX8-NEXT: s_mov_b32 s9, s3 36; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 37; GFX8-NEXT: s_mov_b32 s4, s0 38; GFX8-NEXT: s_mov_b32 s5, s1 39; GFX8-NEXT: s_waitcnt vmcnt(0) 40; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 41; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 42; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 43; GFX8-NEXT: s_endpgm 44; 45; EG-LABEL: or_v2i32: 46; EG: ; %bb.0: 47; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 48; EG-NEXT: 
TEX 0 @6 49; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 50; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 51; EG-NEXT: CF_END 52; EG-NEXT: PAD 53; EG-NEXT: Fetch clause starting at 6: 54; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 55; EG-NEXT: ALU clause starting at 8: 56; EG-NEXT: MOV * T0.X, KC0[2].Z, 57; EG-NEXT: ALU clause starting at 9: 58; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W, 59; EG-NEXT: OR_INT T0.X, T0.X, T0.Z, 60; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 61; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 62 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 63 %a = load <2 x i32>, ptr addrspace(1) %in 64 %b = load <2 x i32>, ptr addrspace(1) %b_ptr 65 %result = or <2 x i32> %a, %b 66 store <2 x i32> %result, ptr addrspace(1) %out 67 ret void 68} 69 70define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 71; GFX6-LABEL: or_v4i32: 72; GFX6: ; %bb.0: 73; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 74; GFX6-NEXT: s_mov_b32 s7, 0xf000 75; GFX6-NEXT: s_mov_b32 s6, -1 76; GFX6-NEXT: s_mov_b32 s10, s6 77; GFX6-NEXT: s_mov_b32 s11, s7 78; GFX6-NEXT: s_waitcnt lgkmcnt(0) 79; GFX6-NEXT: s_mov_b32 s8, s2 80; GFX6-NEXT: s_mov_b32 s9, s3 81; GFX6-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 82; GFX6-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 83; GFX6-NEXT: s_mov_b32 s4, s0 84; GFX6-NEXT: s_mov_b32 s5, s1 85; GFX6-NEXT: s_waitcnt vmcnt(0) 86; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 87; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 88; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 89; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 90; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 91; GFX6-NEXT: s_endpgm 92; 93; GFX8-LABEL: or_v4i32: 94; GFX8: ; %bb.0: 95; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 96; GFX8-NEXT: s_mov_b32 s7, 0xf000 97; GFX8-NEXT: s_mov_b32 s6, -1 98; GFX8-NEXT: s_mov_b32 s10, s6 99; GFX8-NEXT: s_mov_b32 s11, s7 100; GFX8-NEXT: s_waitcnt lgkmcnt(0) 101; GFX8-NEXT: s_mov_b32 s8, s2 102; GFX8-NEXT: s_mov_b32 s9, s3 103; 
GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 104; GFX8-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 105; GFX8-NEXT: s_mov_b32 s4, s0 106; GFX8-NEXT: s_mov_b32 s5, s1 107; GFX8-NEXT: s_waitcnt vmcnt(0) 108; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 109; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 110; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 111; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 112; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 113; GFX8-NEXT: s_endpgm 114; 115; EG-LABEL: or_v4i32: 116; EG: ; %bb.0: 117; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 118; EG-NEXT: TEX 1 @6 119; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 120; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 121; EG-NEXT: CF_END 122; EG-NEXT: PAD 123; EG-NEXT: Fetch clause starting at 6: 124; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 125; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 126; EG-NEXT: ALU clause starting at 10: 127; EG-NEXT: MOV * T0.X, KC0[2].Z, 128; EG-NEXT: ALU clause starting at 11: 129; EG-NEXT: OR_INT * T0.W, T0.W, T1.W, 130; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z, 131; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, 132; EG-NEXT: OR_INT T0.X, T0.X, T1.X, 133; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 134; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 135 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 136 %a = load <4 x i32>, ptr addrspace(1) %in 137 %b = load <4 x i32>, ptr addrspace(1) %b_ptr 138 %result = or <4 x i32> %a, %b 139 store <4 x i32> %result, ptr addrspace(1) %out 140 ret void 141} 142 143define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { 144; GFX6-LABEL: scalar_or_i32: 145; GFX6: ; %bb.0: 146; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 147; GFX6-NEXT: s_mov_b32 s7, 0xf000 148; GFX6-NEXT: s_mov_b32 s6, -1 149; GFX6-NEXT: s_waitcnt lgkmcnt(0) 150; GFX6-NEXT: s_mov_b32 s4, s0 151; GFX6-NEXT: s_or_b32 s0, s2, s3 152; GFX6-NEXT: s_mov_b32 s5, s1 153; GFX6-NEXT: v_mov_b32_e32 v0, s0 154; GFX6-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 155; GFX6-NEXT: s_endpgm 156; 157; GFX8-LABEL: scalar_or_i32: 158; GFX8: ; %bb.0: 159; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 160; GFX8-NEXT: s_mov_b32 s7, 0xf000 161; GFX8-NEXT: s_mov_b32 s6, -1 162; GFX8-NEXT: s_waitcnt lgkmcnt(0) 163; GFX8-NEXT: s_mov_b32 s4, s0 164; GFX8-NEXT: s_or_b32 s0, s2, s3 165; GFX8-NEXT: s_mov_b32 s5, s1 166; GFX8-NEXT: v_mov_b32_e32 v0, s0 167; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 168; GFX8-NEXT: s_endpgm 169; 170; EG-LABEL: scalar_or_i32: 171; EG: ; %bb.0: 172; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 173; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 174; EG-NEXT: CF_END 175; EG-NEXT: PAD 176; EG-NEXT: ALU clause starting at 4: 177; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 178; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W, 179; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 180 %or = or i32 %a, %b 181 store i32 %or, ptr addrspace(1) %out 182 ret void 183} 184 185define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { 186; GFX6-LABEL: vector_or_i32: 187; GFX6: ; %bb.0: 188; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 189; GFX6-NEXT: s_load_dword s12, s[4:5], 0xd 190; GFX6-NEXT: s_mov_b32 s7, 0xf000 191; GFX6-NEXT: s_mov_b32 s6, -1 192; GFX6-NEXT: s_mov_b32 s10, s6 193; GFX6-NEXT: s_waitcnt lgkmcnt(0) 194; GFX6-NEXT: s_mov_b32 s8, s2 195; GFX6-NEXT: s_mov_b32 s9, s3 196; GFX6-NEXT: s_mov_b32 s11, s7 197; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 198; GFX6-NEXT: s_mov_b32 s4, s0 199; GFX6-NEXT: s_mov_b32 s5, s1 200; GFX6-NEXT: s_waitcnt vmcnt(0) 201; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 202; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 203; GFX6-NEXT: s_endpgm 204; 205; GFX8-LABEL: vector_or_i32: 206; GFX8: ; %bb.0: 207; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 208; GFX8-NEXT: s_load_dword s12, s[4:5], 0x34 209; GFX8-NEXT: s_mov_b32 s7, 0xf000 210; GFX8-NEXT: s_mov_b32 s6, -1 211; GFX8-NEXT: s_mov_b32 s10, s6 212; GFX8-NEXT: s_waitcnt lgkmcnt(0) 213; 
GFX8-NEXT: s_mov_b32 s8, s2 214; GFX8-NEXT: s_mov_b32 s9, s3 215; GFX8-NEXT: s_mov_b32 s11, s7 216; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 217; GFX8-NEXT: s_mov_b32 s4, s0 218; GFX8-NEXT: s_mov_b32 s5, s1 219; GFX8-NEXT: s_waitcnt vmcnt(0) 220; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 221; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 222; GFX8-NEXT: s_endpgm 223; 224; EG-LABEL: vector_or_i32: 225; EG: ; %bb.0: 226; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 227; EG-NEXT: TEX 0 @6 228; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 229; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 230; EG-NEXT: CF_END 231; EG-NEXT: PAD 232; EG-NEXT: Fetch clause starting at 6: 233; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 234; EG-NEXT: ALU clause starting at 8: 235; EG-NEXT: MOV * T0.X, KC0[2].Z, 236; EG-NEXT: ALU clause starting at 9: 237; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, 238; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 239; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 240 %loada = load i32, ptr addrspace(1) %a 241 %or = or i32 %loada, %b 242 store i32 %or, ptr addrspace(1) %out 243 ret void 244} 245 246define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { 247; GFX6-LABEL: scalar_or_literal_i32: 248; GFX6: ; %bb.0: 249; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 250; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 251; GFX6-NEXT: s_mov_b32 s3, 0xf000 252; GFX6-NEXT: s_mov_b32 s2, -1 253; GFX6-NEXT: s_waitcnt lgkmcnt(0) 254; GFX6-NEXT: s_or_b32 s4, s6, 0x1869f 255; GFX6-NEXT: v_mov_b32_e32 v0, s4 256; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 257; GFX6-NEXT: s_endpgm 258; 259; GFX8-LABEL: scalar_or_literal_i32: 260; GFX8: ; %bb.0: 261; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c 262; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 263; GFX8-NEXT: s_mov_b32 s3, 0xf000 264; GFX8-NEXT: s_mov_b32 s2, -1 265; GFX8-NEXT: s_waitcnt lgkmcnt(0) 266; GFX8-NEXT: s_or_b32 s4, s6, 0x1869f 267; GFX8-NEXT: v_mov_b32_e32 v0, s4 268; GFX8-NEXT: buffer_store_dword v0, 
off, s[0:3], 0 269; GFX8-NEXT: s_endpgm 270; 271; EG-LABEL: scalar_or_literal_i32: 272; EG: ; %bb.0: 273; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 274; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 275; EG-NEXT: CF_END 276; EG-NEXT: PAD 277; EG-NEXT: ALU clause starting at 4: 278; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 279; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y, 280; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40) 281 %or = or i32 %a, 99999 282 store i32 %or, ptr addrspace(1) %out, align 4 283 ret void 284} 285 286define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { 287; GFX6-LABEL: scalar_or_literal_i64: 288; GFX6: ; %bb.0: 289; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 290; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 291; GFX6-NEXT: s_mov_b32 s3, 0xf000 292; GFX6-NEXT: s_mov_b32 s2, -1 293; GFX6-NEXT: s_waitcnt lgkmcnt(0) 294; GFX6-NEXT: s_or_b32 s4, s7, 0xf237b 295; GFX6-NEXT: s_or_b32 s5, s6, 0x3039 296; GFX6-NEXT: v_mov_b32_e32 v0, s5 297; GFX6-NEXT: v_mov_b32_e32 v1, s4 298; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 299; GFX6-NEXT: s_endpgm 300; 301; GFX8-LABEL: scalar_or_literal_i64: 302; GFX8: ; %bb.0: 303; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c 304; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 305; GFX8-NEXT: s_mov_b32 s3, 0xf000 306; GFX8-NEXT: s_mov_b32 s2, -1 307; GFX8-NEXT: s_waitcnt lgkmcnt(0) 308; GFX8-NEXT: s_or_b32 s4, s7, 0xf237b 309; GFX8-NEXT: s_or_b32 s5, s6, 0x3039 310; GFX8-NEXT: v_mov_b32_e32 v0, s5 311; GFX8-NEXT: v_mov_b32_e32 v1, s4 312; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 313; GFX8-NEXT: s_endpgm 314; 315; EG-LABEL: scalar_or_literal_i64: 316; EG: ; %bb.0: 317; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 318; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 319; EG-NEXT: CF_END 320; EG-NEXT: PAD 321; EG-NEXT: ALU clause starting at 4: 322; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x, 323; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) 324; 
EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, 325; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 326; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) 327 %or = or i64 %a, 4261135838621753 328 store i64 %or, ptr addrspace(1) %out 329 ret void 330} 331 332define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { 333; GFX6-LABEL: scalar_or_literal_multi_use_i64: 334; GFX6: ; %bb.0: 335; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 336; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 337; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d 338; GFX6-NEXT: s_movk_i32 s8, 0x3039 339; GFX6-NEXT: s_mov_b32 s9, 0xf237b 340; GFX6-NEXT: s_mov_b32 s3, 0xf000 341; GFX6-NEXT: s_waitcnt lgkmcnt(0) 342; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] 343; GFX6-NEXT: v_mov_b32_e32 v0, s6 344; GFX6-NEXT: s_mov_b32 s2, -1 345; GFX6-NEXT: v_mov_b32_e32 v1, s7 346; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 347; GFX6-NEXT: s_add_u32 s0, s4, 0x3039 348; GFX6-NEXT: s_addc_u32 s1, s5, 0xf237b 349; GFX6-NEXT: s_waitcnt expcnt(0) 350; GFX6-NEXT: v_mov_b32_e32 v0, s0 351; GFX6-NEXT: v_mov_b32_e32 v1, s1 352; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 353; GFX6-NEXT: s_waitcnt vmcnt(0) 354; GFX6-NEXT: s_endpgm 355; 356; GFX8-LABEL: scalar_or_literal_multi_use_i64: 357; GFX8: ; %bb.0: 358; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 359; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c 360; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74 361; GFX8-NEXT: s_movk_i32 s8, 0x3039 362; GFX8-NEXT: s_mov_b32 s9, 0xf237b 363; GFX8-NEXT: s_mov_b32 s3, 0xf000 364; GFX8-NEXT: s_waitcnt lgkmcnt(0) 365; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] 366; GFX8-NEXT: v_mov_b32_e32 v0, s6 367; GFX8-NEXT: s_mov_b32 s2, -1 368; GFX8-NEXT: v_mov_b32_e32 v1, s7 369; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 370; GFX8-NEXT: s_add_u32 s0, s4, 0x3039 371; GFX8-NEXT: s_addc_u32 s1, s5, 0xf237b 372; GFX8-NEXT: v_mov_b32_e32 v0, s0 373; 
GFX8-NEXT: v_mov_b32_e32 v1, s1 374; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 375; GFX8-NEXT: s_waitcnt vmcnt(0) 376; GFX8-NEXT: s_endpgm 377; 378; EG-LABEL: scalar_or_literal_multi_use_i64: 379; EG: ; %bb.0: 380; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] 381; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 382; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 383; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 384; EG-NEXT: CF_END 385; EG-NEXT: PAD 386; EG-NEXT: ALU clause starting at 6: 387; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x, 388; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) 389; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x, 390; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W, 391; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) 392; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, 393; EG-NEXT: MOV * T2.X, literal.y, 394; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) 395; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x, 396; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) 397; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x, 398; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, 399; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) 400 %or = or i64 %a, 4261135838621753 401 store i64 %or, ptr addrspace(1) %out 402 403 %foo = add i64 %b, 4261135838621753 404 store volatile i64 %foo, ptr addrspace(1) undef 405 ret void 406} 407 408define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { 409; GFX6-LABEL: scalar_or_inline_imm_i64: 410; GFX6: ; %bb.0: 411; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 412; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 413; GFX6-NEXT: s_mov_b32 s3, 0xf000 414; GFX6-NEXT: s_mov_b32 s2, -1 415; GFX6-NEXT: s_waitcnt lgkmcnt(0) 416; GFX6-NEXT: s_or_b32 s4, s6, 63 417; GFX6-NEXT: v_mov_b32_e32 v0, s4 418; GFX6-NEXT: v_mov_b32_e32 v1, s7 419; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 420; GFX6-NEXT: s_endpgm 421; 422; GFX8-LABEL: scalar_or_inline_imm_i64: 423; GFX8: ; %bb.0: 424; GFX8-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x4c 425; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 426; GFX8-NEXT: s_mov_b32 s3, 0xf000 427; GFX8-NEXT: s_mov_b32 s2, -1 428; GFX8-NEXT: s_waitcnt lgkmcnt(0) 429; GFX8-NEXT: s_or_b32 s4, s6, 63 430; GFX8-NEXT: v_mov_b32_e32 v0, s4 431; GFX8-NEXT: v_mov_b32_e32 v1, s7 432; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 433; GFX8-NEXT: s_endpgm 434; 435; EG-LABEL: scalar_or_inline_imm_i64: 436; EG: ; %bb.0: 437; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 438; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 439; EG-NEXT: CF_END 440; EG-NEXT: PAD 441; EG-NEXT: ALU clause starting at 4: 442; EG-NEXT: MOV * T0.Y, KC0[5].X, 443; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, 444; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 445; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) 446 %or = or i64 %a, 63 447 store i64 %or, ptr addrspace(1) %out 448 ret void 449} 450 451define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { 452; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64: 453; GFX6: ; %bb.0: 454; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 455; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 456; GFX6-NEXT: s_mov_b32 s7, 0xf000 457; GFX6-NEXT: s_mov_b32 s6, -1 458; GFX6-NEXT: s_waitcnt lgkmcnt(0) 459; GFX6-NEXT: s_mov_b32 s4, s0 460; GFX6-NEXT: s_or_b32 s0, s2, 63 461; GFX6-NEXT: s_mov_b32 s5, s1 462; GFX6-NEXT: v_mov_b32_e32 v0, s0 463; GFX6-NEXT: v_mov_b32_e32 v1, s3 464; GFX6-NEXT: s_add_u32 s0, s8, 63 465; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 466; GFX6-NEXT: s_addc_u32 s1, s9, 0 467; GFX6-NEXT: s_waitcnt expcnt(0) 468; GFX6-NEXT: v_mov_b32_e32 v0, s0 469; GFX6-NEXT: v_mov_b32_e32 v1, s1 470; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 471; GFX6-NEXT: s_waitcnt vmcnt(0) 472; GFX6-NEXT: s_endpgm 473; 474; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64: 475; GFX8: ; %bb.0: 476; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 477; GFX8-NEXT: s_load_dwordx2 s[8:9], 
s[4:5], 0x34 478; GFX8-NEXT: s_mov_b32 s7, 0xf000 479; GFX8-NEXT: s_mov_b32 s6, -1 480; GFX8-NEXT: s_waitcnt lgkmcnt(0) 481; GFX8-NEXT: s_mov_b32 s4, s0 482; GFX8-NEXT: s_or_b32 s0, s2, 63 483; GFX8-NEXT: s_mov_b32 s5, s1 484; GFX8-NEXT: v_mov_b32_e32 v0, s0 485; GFX8-NEXT: v_mov_b32_e32 v1, s3 486; GFX8-NEXT: s_add_u32 s0, s8, 63 487; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 488; GFX8-NEXT: s_addc_u32 s1, s9, 0 489; GFX8-NEXT: v_mov_b32_e32 v0, s0 490; GFX8-NEXT: v_mov_b32_e32 v1, s1 491; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 492; GFX8-NEXT: s_waitcnt vmcnt(0) 493; GFX8-NEXT: s_endpgm 494; 495; EG-LABEL: scalar_or_inline_imm_multi_use_i64: 496; EG: ; %bb.0: 497; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] 498; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 499; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 500; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 501; EG-NEXT: CF_END 502; EG-NEXT: PAD 503; EG-NEXT: ALU clause starting at 6: 504; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x, 505; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x, 506; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) 507; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W, 508; EG-NEXT: MOV * T2.X, literal.x, 509; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 510; EG-NEXT: MOV * T3.Y, KC0[3].X, 511; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x, 512; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, 513; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) 514 %or = or i64 %a, 63 515 store i64 %or, ptr addrspace(1) %out 516 %foo = add i64 %b, 63 517 store volatile i64 %foo, ptr addrspace(1) undef 518 ret void 519} 520 521define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { 522; GFX6-LABEL: scalar_or_neg_inline_imm_i64: 523; GFX6: ; %bb.0: 524; GFX6-NEXT: s_load_dword s6, s[4:5], 0x13 525; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 526; GFX6-NEXT: s_mov_b32 s3, 0xf000 527; GFX6-NEXT: s_mov_b32 s2, -1 528; GFX6-NEXT: v_mov_b32_e32 v1, -1 529; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 530; GFX6-NEXT: s_or_b32 s4, s6, -8 531; GFX6-NEXT: v_mov_b32_e32 v0, s4 532; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 533; GFX6-NEXT: s_endpgm 534; 535; GFX8-LABEL: scalar_or_neg_inline_imm_i64: 536; GFX8: ; %bb.0: 537; GFX8-NEXT: s_load_dword s6, s[4:5], 0x4c 538; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 539; GFX8-NEXT: s_mov_b32 s3, 0xf000 540; GFX8-NEXT: s_mov_b32 s2, -1 541; GFX8-NEXT: v_mov_b32_e32 v1, -1 542; GFX8-NEXT: s_waitcnt lgkmcnt(0) 543; GFX8-NEXT: s_or_b32 s4, s6, -8 544; GFX8-NEXT: v_mov_b32_e32 v0, s4 545; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 546; GFX8-NEXT: s_endpgm 547; 548; EG-LABEL: scalar_or_neg_inline_imm_i64: 549; EG: ; %bb.0: 550; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 551; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 552; EG-NEXT: CF_END 553; EG-NEXT: PAD 554; EG-NEXT: ALU clause starting at 4: 555; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, 556; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 557; EG-NEXT: -8(nan), 2(2.802597e-45) 558; EG-NEXT: MOV * T0.Y, literal.x, 559; EG-NEXT: -1(nan), 0(0.000000e+00) 560 %or = or i64 %a, -8 561 store i64 %or, ptr addrspace(1) %out 562 ret void 563} 564 565define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 566; GFX6-LABEL: vector_or_literal_i32: 567; GFX6: ; %bb.0: 568; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 569; GFX6-NEXT: s_mov_b32 s7, 0xf000 570; GFX6-NEXT: s_mov_b32 s6, -1 571; GFX6-NEXT: s_mov_b32 s10, s6 572; GFX6-NEXT: s_mov_b32 s11, s7 573; GFX6-NEXT: s_waitcnt lgkmcnt(0) 574; GFX6-NEXT: s_mov_b32 s8, s2 575; GFX6-NEXT: s_mov_b32 s9, s3 576; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 577; GFX6-NEXT: s_mov_b32 s4, s0 578; GFX6-NEXT: s_mov_b32 s5, s1 579; GFX6-NEXT: s_waitcnt vmcnt(0) 580; GFX6-NEXT: v_or_b32_e32 v0, 0xffff, v0 581; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 582; GFX6-NEXT: s_endpgm 583; 584; GFX8-LABEL: 
vector_or_literal_i32: 585; GFX8: ; %bb.0: 586; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 587; GFX8-NEXT: s_mov_b32 s7, 0xf000 588; GFX8-NEXT: s_mov_b32 s6, -1 589; GFX8-NEXT: s_mov_b32 s10, s6 590; GFX8-NEXT: s_mov_b32 s11, s7 591; GFX8-NEXT: s_waitcnt lgkmcnt(0) 592; GFX8-NEXT: s_mov_b32 s8, s2 593; GFX8-NEXT: s_mov_b32 s9, s3 594; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 595; GFX8-NEXT: s_mov_b32 s4, s0 596; GFX8-NEXT: s_mov_b32 s5, s1 597; GFX8-NEXT: s_waitcnt vmcnt(0) 598; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0 599; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 600; GFX8-NEXT: s_endpgm 601; 602; EG-LABEL: vector_or_literal_i32: 603; EG: ; %bb.0: 604; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 605; EG-NEXT: TEX 0 @6 606; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 607; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 608; EG-NEXT: CF_END 609; EG-NEXT: PAD 610; EG-NEXT: Fetch clause starting at 6: 611; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 612; EG-NEXT: ALU clause starting at 8: 613; EG-NEXT: MOV * T0.X, KC0[2].Z, 614; EG-NEXT: ALU clause starting at 9: 615; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 616; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 617; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) 618 %loada = load i32, ptr addrspace(1) %a, align 4 619 %or = or i32 %loada, 65535 620 store i32 %or, ptr addrspace(1) %out, align 4 621 ret void 622} 623 624define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 625; GFX6-LABEL: vector_or_inline_immediate_i32: 626; GFX6: ; %bb.0: 627; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 628; GFX6-NEXT: s_mov_b32 s7, 0xf000 629; GFX6-NEXT: s_mov_b32 s6, -1 630; GFX6-NEXT: s_mov_b32 s10, s6 631; GFX6-NEXT: s_mov_b32 s11, s7 632; GFX6-NEXT: s_waitcnt lgkmcnt(0) 633; GFX6-NEXT: s_mov_b32 s8, s2 634; GFX6-NEXT: s_mov_b32 s9, s3 635; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 636; GFX6-NEXT: s_mov_b32 s4, s0 637; GFX6-NEXT: s_mov_b32 s5, s1 638; 
GFX6-NEXT: s_waitcnt vmcnt(0) 639; GFX6-NEXT: v_or_b32_e32 v0, 4, v0 640; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 641; GFX6-NEXT: s_endpgm 642; 643; GFX8-LABEL: vector_or_inline_immediate_i32: 644; GFX8: ; %bb.0: 645; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 646; GFX8-NEXT: s_mov_b32 s7, 0xf000 647; GFX8-NEXT: s_mov_b32 s6, -1 648; GFX8-NEXT: s_mov_b32 s10, s6 649; GFX8-NEXT: s_mov_b32 s11, s7 650; GFX8-NEXT: s_waitcnt lgkmcnt(0) 651; GFX8-NEXT: s_mov_b32 s8, s2 652; GFX8-NEXT: s_mov_b32 s9, s3 653; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 654; GFX8-NEXT: s_mov_b32 s4, s0 655; GFX8-NEXT: s_mov_b32 s5, s1 656; GFX8-NEXT: s_waitcnt vmcnt(0) 657; GFX8-NEXT: v_or_b32_e32 v0, 4, v0 658; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 659; GFX8-NEXT: s_endpgm 660; 661; EG-LABEL: vector_or_inline_immediate_i32: 662; EG: ; %bb.0: 663; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 664; EG-NEXT: TEX 0 @6 665; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 666; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 667; EG-NEXT: CF_END 668; EG-NEXT: PAD 669; EG-NEXT: Fetch clause starting at 6: 670; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 671; EG-NEXT: ALU clause starting at 8: 672; EG-NEXT: MOV * T0.X, KC0[2].Z, 673; EG-NEXT: ALU clause starting at 9: 674; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 675; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 676; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) 677 %loada = load i32, ptr addrspace(1) %a, align 4 678 %or = or i32 %loada, 4 679 store i32 %or, ptr addrspace(1) %out, align 4 680 ret void 681} 682 683define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { 684; GFX6-LABEL: scalar_or_i64: 685; GFX6: ; %bb.0: 686; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 687; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 688; GFX6-NEXT: s_mov_b32 s7, 0xf000 689; GFX6-NEXT: s_mov_b32 s6, -1 690; GFX6-NEXT: s_waitcnt lgkmcnt(0) 691; GFX6-NEXT: s_mov_b32 s4, s0 692; GFX6-NEXT: s_mov_b32 s5, s1 693; GFX6-NEXT: s_or_b64 
s[0:1], s[2:3], s[8:9] 694; GFX6-NEXT: v_mov_b32_e32 v0, s0 695; GFX6-NEXT: v_mov_b32_e32 v1, s1 696; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 697; GFX6-NEXT: s_endpgm 698; 699; GFX8-LABEL: scalar_or_i64: 700; GFX8: ; %bb.0: 701; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 702; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 703; GFX8-NEXT: s_mov_b32 s7, 0xf000 704; GFX8-NEXT: s_mov_b32 s6, -1 705; GFX8-NEXT: s_waitcnt lgkmcnt(0) 706; GFX8-NEXT: s_mov_b32 s4, s0 707; GFX8-NEXT: s_mov_b32 s5, s1 708; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] 709; GFX8-NEXT: v_mov_b32_e32 v0, s0 710; GFX8-NEXT: v_mov_b32_e32 v1, s1 711; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 712; GFX8-NEXT: s_endpgm 713; 714; EG-LABEL: scalar_or_i64: 715; EG: ; %bb.0: 716; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 717; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 718; EG-NEXT: CF_END 719; EG-NEXT: PAD 720; EG-NEXT: ALU clause starting at 4: 721; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z, 722; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y, 723; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 724; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 725 %or = or i64 %a, %b 726 store i64 %or, ptr addrspace(1) %out 727 ret void 728} 729 730define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 731; GFX6-LABEL: vector_or_i64: 732; GFX6: ; %bb.0: 733; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 734; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 735; GFX6-NEXT: s_mov_b32 s7, 0xf000 736; GFX6-NEXT: s_mov_b32 s6, -1 737; GFX6-NEXT: s_mov_b32 s10, s6 738; GFX6-NEXT: s_mov_b32 s11, s7 739; GFX6-NEXT: s_waitcnt lgkmcnt(0) 740; GFX6-NEXT: s_mov_b32 s12, s2 741; GFX6-NEXT: s_mov_b32 s13, s3 742; GFX6-NEXT: s_mov_b32 s14, s6 743; GFX6-NEXT: s_mov_b32 s15, s7 744; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 745; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 746; GFX6-NEXT: s_mov_b32 s4, s0 747; GFX6-NEXT: s_mov_b32 s5, 
s1 748; GFX6-NEXT: s_waitcnt vmcnt(0) 749; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 750; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 751; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 752; GFX6-NEXT: s_endpgm 753; 754; GFX8-LABEL: vector_or_i64: 755; GFX8: ; %bb.0: 756; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 757; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 758; GFX8-NEXT: s_mov_b32 s7, 0xf000 759; GFX8-NEXT: s_mov_b32 s6, -1 760; GFX8-NEXT: s_mov_b32 s10, s6 761; GFX8-NEXT: s_mov_b32 s11, s7 762; GFX8-NEXT: s_waitcnt lgkmcnt(0) 763; GFX8-NEXT: s_mov_b32 s12, s2 764; GFX8-NEXT: s_mov_b32 s13, s3 765; GFX8-NEXT: s_mov_b32 s14, s6 766; GFX8-NEXT: s_mov_b32 s15, s7 767; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 768; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 769; GFX8-NEXT: s_mov_b32 s4, s0 770; GFX8-NEXT: s_mov_b32 s5, s1 771; GFX8-NEXT: s_waitcnt vmcnt(0) 772; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 773; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 774; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 775; GFX8-NEXT: s_endpgm 776; 777; EG-LABEL: vector_or_i64: 778; EG: ; %bb.0: 779; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 780; EG-NEXT: TEX 1 @6 781; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] 782; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 783; EG-NEXT: CF_END 784; EG-NEXT: PAD 785; EG-NEXT: Fetch clause starting at 6: 786; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 787; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 788; EG-NEXT: ALU clause starting at 10: 789; EG-NEXT: MOV T0.X, KC0[2].Z, 790; EG-NEXT: MOV * T1.X, KC0[2].W, 791; EG-NEXT: ALU clause starting at 12: 792; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, 793; EG-NEXT: OR_INT T0.X, T0.X, T1.X, 794; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 795; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 796 %loada = load i64, ptr addrspace(1) %a, align 8 797 %loadb = load i64, ptr addrspace(1) %b, align 8 798 %or = or i64 %loada, %loadb 799 store i64 %or, ptr addrspace(1) %out 800 ret void 801} 802 803define 
amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { 804; GFX6-LABEL: scalar_vector_or_i64: 805; GFX6: ; %bb.0: 806; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 807; GFX6-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd 808; GFX6-NEXT: s_mov_b32 s7, 0xf000 809; GFX6-NEXT: s_mov_b32 s6, -1 810; GFX6-NEXT: s_mov_b32 s10, s6 811; GFX6-NEXT: s_waitcnt lgkmcnt(0) 812; GFX6-NEXT: s_mov_b32 s8, s2 813; GFX6-NEXT: s_mov_b32 s9, s3 814; GFX6-NEXT: s_mov_b32 s11, s7 815; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 816; GFX6-NEXT: s_mov_b32 s4, s0 817; GFX6-NEXT: s_mov_b32 s5, s1 818; GFX6-NEXT: s_waitcnt vmcnt(0) 819; GFX6-NEXT: v_or_b32_e32 v0, s12, v0 820; GFX6-NEXT: v_or_b32_e32 v1, s13, v1 821; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 822; GFX6-NEXT: s_endpgm 823; 824; GFX8-LABEL: scalar_vector_or_i64: 825; GFX8: ; %bb.0: 826; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 827; GFX8-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34 828; GFX8-NEXT: s_mov_b32 s7, 0xf000 829; GFX8-NEXT: s_mov_b32 s6, -1 830; GFX8-NEXT: s_mov_b32 s10, s6 831; GFX8-NEXT: s_waitcnt lgkmcnt(0) 832; GFX8-NEXT: s_mov_b32 s8, s2 833; GFX8-NEXT: s_mov_b32 s9, s3 834; GFX8-NEXT: s_mov_b32 s11, s7 835; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 836; GFX8-NEXT: s_mov_b32 s4, s0 837; GFX8-NEXT: s_mov_b32 s5, s1 838; GFX8-NEXT: s_waitcnt vmcnt(0) 839; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 840; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 841; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 842; GFX8-NEXT: s_endpgm 843; 844; EG-LABEL: scalar_vector_or_i64: 845; EG: ; %bb.0: 846; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 847; EG-NEXT: TEX 0 @6 848; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 849; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 850; EG-NEXT: CF_END 851; EG-NEXT: PAD 852; EG-NEXT: Fetch clause starting at 6: 853; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 854; EG-NEXT: ALU clause starting at 8: 855; EG-NEXT: MOV * T0.X, KC0[2].Z, 
856; EG-NEXT: ALU clause starting at 9: 857; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X, 858; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, 859; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 860; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 861 %loada = load i64, ptr addrspace(1) %a 862 %or = or i64 %loada, %b 863 store i64 %or, ptr addrspace(1) %out 864 ret void 865} 866 867define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 868; GFX6-LABEL: vector_or_i64_loadimm: 869; GFX6: ; %bb.0: 870; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 871; GFX6-NEXT: s_mov_b32 s7, 0xf000 872; GFX6-NEXT: s_mov_b32 s6, -1 873; GFX6-NEXT: s_mov_b32 s10, s6 874; GFX6-NEXT: s_mov_b32 s11, s7 875; GFX6-NEXT: s_waitcnt lgkmcnt(0) 876; GFX6-NEXT: s_mov_b32 s8, s2 877; GFX6-NEXT: s_mov_b32 s9, s3 878; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 879; GFX6-NEXT: s_mov_b32 s4, s0 880; GFX6-NEXT: s_mov_b32 s5, s1 881; GFX6-NEXT: s_waitcnt vmcnt(0) 882; GFX6-NEXT: v_or_b32_e32 v1, 0x146f, v1 883; GFX6-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0 884; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 885; GFX6-NEXT: s_endpgm 886; 887; GFX8-LABEL: vector_or_i64_loadimm: 888; GFX8: ; %bb.0: 889; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 890; GFX8-NEXT: s_mov_b32 s7, 0xf000 891; GFX8-NEXT: s_mov_b32 s6, -1 892; GFX8-NEXT: s_mov_b32 s10, s6 893; GFX8-NEXT: s_mov_b32 s11, s7 894; GFX8-NEXT: s_waitcnt lgkmcnt(0) 895; GFX8-NEXT: s_mov_b32 s8, s2 896; GFX8-NEXT: s_mov_b32 s9, s3 897; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 898; GFX8-NEXT: s_mov_b32 s4, s0 899; GFX8-NEXT: s_mov_b32 s5, s1 900; GFX8-NEXT: s_waitcnt vmcnt(0) 901; GFX8-NEXT: v_or_b32_e32 v1, 0x146f, v1 902; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0 903; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 904; GFX8-NEXT: s_endpgm 905; 906; EG-LABEL: vector_or_i64_loadimm: 907; EG: ; %bb.0: 908; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 909; EG-NEXT: TEX 0 @6 910; 
EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 911; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 912; EG-NEXT: CF_END 913; EG-NEXT: PAD 914; EG-NEXT: Fetch clause starting at 6: 915; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 916; EG-NEXT: ALU clause starting at 8: 917; EG-NEXT: MOV * T0.X, KC0[2].Z, 918; EG-NEXT: ALU clause starting at 9: 919; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x, 920; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00) 921; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 922; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 923; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45) 924 %loada = load i64, ptr addrspace(1) %a, align 8 925 %or = or i64 %loada, 22470723082367 926 store i64 %or, ptr addrspace(1) %out 927 ret void 928} 929 930; FIXME: The or of the high 32 bits with 0 should really be removed. NOTE(review): the autogenerated checks for this function show no such or on any checked target, so this FIXME looks stale -- confirm and drop. 931define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 932; GFX6-LABEL: vector_or_i64_imm: 933; GFX6: ; %bb.0: 934; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 935; GFX6-NEXT: s_mov_b32 s7, 0xf000 936; GFX6-NEXT: s_mov_b32 s6, -1 937; GFX6-NEXT: s_mov_b32 s10, s6 938; GFX6-NEXT: s_mov_b32 s11, s7 939; GFX6-NEXT: s_waitcnt lgkmcnt(0) 940; GFX6-NEXT: s_mov_b32 s8, s2 941; GFX6-NEXT: s_mov_b32 s9, s3 942; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 943; GFX6-NEXT: s_mov_b32 s4, s0 944; GFX6-NEXT: s_mov_b32 s5, s1 945; GFX6-NEXT: s_waitcnt vmcnt(0) 946; GFX6-NEXT: v_or_b32_e32 v0, 8, v0 947; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 948; GFX6-NEXT: s_endpgm 949; 950; GFX8-LABEL: vector_or_i64_imm: 951; GFX8: ; %bb.0: 952; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 953; GFX8-NEXT: s_mov_b32 s7, 0xf000 954; GFX8-NEXT: s_mov_b32 s6, -1 955; GFX8-NEXT: s_mov_b32 s10, s6 956; GFX8-NEXT: s_mov_b32 s11, s7 957; GFX8-NEXT: s_waitcnt lgkmcnt(0) 958; GFX8-NEXT: s_mov_b32 s8, s2 959; GFX8-NEXT: s_mov_b32 s9, s3 960; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 961; GFX8-NEXT: s_mov_b32 s4, s0 962; GFX8-NEXT: 
s_mov_b32 s5, s1 963; GFX8-NEXT: s_waitcnt vmcnt(0) 964; GFX8-NEXT: v_or_b32_e32 v0, 8, v0 965; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 966; GFX8-NEXT: s_endpgm 967; 968; EG-LABEL: vector_or_i64_imm: 969; EG: ; %bb.0: 970; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 971; EG-NEXT: TEX 0 @6 972; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 973; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 974; EG-NEXT: CF_END 975; EG-NEXT: PAD 976; EG-NEXT: Fetch clause starting at 6: 977; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 978; EG-NEXT: ALU clause starting at 8: 979; EG-NEXT: MOV * T0.X, KC0[2].Z, 980; EG-NEXT: ALU clause starting at 9: 981; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 982; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 983; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 984 %loada = load i64, ptr addrspace(1) %a, align 8 985 %or = or i64 %loada, 8 986 store i64 %or, ptr addrspace(1) %out 987 ret void 988} 989 990define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 991; GFX6-LABEL: vector_or_i64_neg_inline_imm: 992; GFX6: ; %bb.0: 993; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 994; GFX6-NEXT: s_mov_b32 s7, 0xf000 995; GFX6-NEXT: s_mov_b32 s6, -1 996; GFX6-NEXT: s_mov_b32 s10, s6 997; GFX6-NEXT: s_mov_b32 s11, s7 998; GFX6-NEXT: s_waitcnt lgkmcnt(0) 999; GFX6-NEXT: s_mov_b32 s8, s2 1000; GFX6-NEXT: s_mov_b32 s9, s3 1001; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 1002; GFX6-NEXT: s_mov_b32 s4, s0 1003; GFX6-NEXT: s_mov_b32 s5, s1 1004; GFX6-NEXT: v_mov_b32_e32 v1, -1 1005; GFX6-NEXT: s_waitcnt vmcnt(0) 1006; GFX6-NEXT: v_or_b32_e32 v0, -8, v0 1007; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1008; GFX6-NEXT: s_endpgm 1009; 1010; GFX8-LABEL: vector_or_i64_neg_inline_imm: 1011; GFX8: ; %bb.0: 1012; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1013; GFX8-NEXT: s_mov_b32 s7, 0xf000 1014; GFX8-NEXT: s_mov_b32 s6, -1 1015; GFX8-NEXT: s_mov_b32 s10, s6 1016; GFX8-NEXT: 
s_mov_b32 s11, s7 1017; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX8-NEXT: s_mov_b32 s8, s2 1019; GFX8-NEXT: s_mov_b32 s9, s3 1020; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 1021; GFX8-NEXT: s_mov_b32 s4, s0 1022; GFX8-NEXT: s_mov_b32 s5, s1 1023; GFX8-NEXT: v_mov_b32_e32 v1, -1 1024; GFX8-NEXT: s_waitcnt vmcnt(0) 1025; GFX8-NEXT: v_or_b32_e32 v0, -8, v0 1026; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1027; GFX8-NEXT: s_endpgm 1028; 1029; EG-LABEL: vector_or_i64_neg_inline_imm: 1030; EG: ; %bb.0: 1031; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1032; EG-NEXT: TEX 0 @6 1033; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1034; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1035; EG-NEXT: CF_END 1036; EG-NEXT: PAD 1037; EG-NEXT: Fetch clause starting at 6: 1038; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1039; EG-NEXT: ALU clause starting at 8: 1040; EG-NEXT: MOV * T0.X, KC0[2].Z, 1041; EG-NEXT: ALU clause starting at 9: 1042; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 1043; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1044; EG-NEXT: -8(nan), 2(2.802597e-45) 1045; EG-NEXT: MOV * T0.Y, literal.x, 1046; EG-NEXT: -1(nan), 0(0.000000e+00) 1047 %loada = load i64, ptr addrspace(1) %a, align 8 1048 %or = or i64 %loada, -8 1049 store i64 %or, ptr addrspace(1) %out 1050 ret void 1051} 1052 1053define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { 1054; GFX6-LABEL: vector_or_i64_neg_literal: 1055; GFX6: ; %bb.0: 1056; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1057; GFX6-NEXT: s_mov_b32 s7, 0xf000 1058; GFX6-NEXT: s_mov_b32 s6, -1 1059; GFX6-NEXT: s_mov_b32 s10, s6 1060; GFX6-NEXT: s_mov_b32 s11, s7 1061; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1062; GFX6-NEXT: s_mov_b32 s8, s2 1063; GFX6-NEXT: s_mov_b32 s9, s3 1064; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 1065; GFX6-NEXT: s_mov_b32 s4, s0 1066; GFX6-NEXT: s_mov_b32 s5, s1 1067; GFX6-NEXT: v_mov_b32_e32 v1, -1 1068; GFX6-NEXT: s_waitcnt vmcnt(0) 1069; 
GFX6-NEXT: v_or_b32_e32 v0, 0xffffff38, v0 1070; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1071; GFX6-NEXT: s_endpgm 1072; 1073; GFX8-LABEL: vector_or_i64_neg_literal: 1074; GFX8: ; %bb.0: 1075; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1076; GFX8-NEXT: s_mov_b32 s7, 0xf000 1077; GFX8-NEXT: s_mov_b32 s6, -1 1078; GFX8-NEXT: s_mov_b32 s10, s6 1079; GFX8-NEXT: s_mov_b32 s11, s7 1080; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1081; GFX8-NEXT: s_mov_b32 s8, s2 1082; GFX8-NEXT: s_mov_b32 s9, s3 1083; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 1084; GFX8-NEXT: s_mov_b32 s4, s0 1085; GFX8-NEXT: s_mov_b32 s5, s1 1086; GFX8-NEXT: v_mov_b32_e32 v1, -1 1087; GFX8-NEXT: s_waitcnt vmcnt(0) 1088; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0 1089; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1090; GFX8-NEXT: s_endpgm 1091; 1092; EG-LABEL: vector_or_i64_neg_literal: 1093; EG: ; %bb.0: 1094; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1095; EG-NEXT: TEX 0 @6 1096; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1097; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1098; EG-NEXT: CF_END 1099; EG-NEXT: PAD 1100; EG-NEXT: Fetch clause starting at 6: 1101; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1102; EG-NEXT: ALU clause starting at 8: 1103; EG-NEXT: MOV * T0.X, KC0[2].Z, 1104; EG-NEXT: ALU clause starting at 9: 1105; EG-NEXT: OR_INT T0.X, T0.X, literal.x, 1106; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1107; EG-NEXT: -200(nan), 2(2.802597e-45) 1108; EG-NEXT: MOV * T0.Y, literal.x, 1109; EG-NEXT: -1(nan), 0(0.000000e+00) 1110 %loada = load i64, ptr addrspace(1) %a, align 8 1111 %or = or i64 %loada, -200 1112 store i64 %or, ptr addrspace(1) %out 1113 ret void 1114} 1115 1116define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { 1117; GFX6-LABEL: trunc_i64_or_to_i32: 1118; GFX6: ; %bb.0: 1119; GFX6-NEXT: s_load_dword s6, s[4:5], 0x13 1120; GFX6-NEXT: s_load_dword s7, s[4:5], 0x1d 1121; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x9 1122; GFX6-NEXT: s_mov_b32 s3, 0xf000 1123; GFX6-NEXT: s_mov_b32 s2, -1 1124; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX6-NEXT: s_or_b32 s4, s7, s6 1126; GFX6-NEXT: v_mov_b32_e32 v0, s4 1127; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 1128; GFX6-NEXT: s_endpgm 1129; 1130; GFX8-LABEL: trunc_i64_or_to_i32: 1131; GFX8: ; %bb.0: 1132; GFX8-NEXT: s_load_dword s6, s[4:5], 0x4c 1133; GFX8-NEXT: s_load_dword s7, s[4:5], 0x74 1134; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1135; GFX8-NEXT: s_mov_b32 s3, 0xf000 1136; GFX8-NEXT: s_mov_b32 s2, -1 1137; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX8-NEXT: s_or_b32 s4, s7, s6 1139; GFX8-NEXT: v_mov_b32_e32 v0, s4 1140; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 1141; GFX8-NEXT: s_endpgm 1142; 1143; EG-LABEL: trunc_i64_or_to_i32: 1144; EG: ; %bb.0: 1145; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1146; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1147; EG-NEXT: CF_END 1148; EG-NEXT: PAD 1149; EG-NEXT: ALU clause starting at 4: 1150; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1151; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1152; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W, 1153 %add = or i64 %b, %a 1154 %trunc = trunc i64 %add to i32 1155 store i32 %trunc, ptr addrspace(1) %out, align 8 1156 ret void 1157} 1158 1159define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { 1160; GFX6-LABEL: or_i1: 1161; GFX6: ; %bb.0: 1162; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1163; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1164; GFX6-NEXT: s_mov_b32 s7, 0xf000 1165; GFX6-NEXT: s_mov_b32 s6, -1 1166; GFX6-NEXT: s_mov_b32 s10, s6 1167; GFX6-NEXT: s_mov_b32 s11, s7 1168; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1169; GFX6-NEXT: s_mov_b32 s12, s2 1170; GFX6-NEXT: s_mov_b32 s13, s3 1171; GFX6-NEXT: s_mov_b32 s14, s6 1172; GFX6-NEXT: s_mov_b32 s15, s7 1173; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 1174; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0 1175; 
GFX6-NEXT: s_mov_b32 s4, s0 1176; GFX6-NEXT: s_mov_b32 s5, s1 1177; GFX6-NEXT: s_waitcnt vmcnt(1) 1178; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 1179; GFX6-NEXT: s_waitcnt vmcnt(0) 1180; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 1181; GFX6-NEXT: v_max_f32_e32 v0, v1, v0 1182; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 1183; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1184; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 1185; GFX6-NEXT: s_endpgm 1186; 1187; GFX8-LABEL: or_i1: 1188; GFX8: ; %bb.0: 1189; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1190; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1191; GFX8-NEXT: s_mov_b32 s7, 0xf000 1192; GFX8-NEXT: s_mov_b32 s6, -1 1193; GFX8-NEXT: s_mov_b32 s10, s6 1194; GFX8-NEXT: s_mov_b32 s11, s7 1195; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1196; GFX8-NEXT: s_mov_b32 s12, s2 1197; GFX8-NEXT: s_mov_b32 s13, s3 1198; GFX8-NEXT: s_mov_b32 s14, s6 1199; GFX8-NEXT: s_mov_b32 s15, s7 1200; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 1201; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0 1202; GFX8-NEXT: s_mov_b32 s4, s0 1203; GFX8-NEXT: s_mov_b32 s5, s1 1204; GFX8-NEXT: s_waitcnt vmcnt(1) 1205; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 1206; GFX8-NEXT: s_waitcnt vmcnt(0) 1207; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 1208; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 1209; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0 1210; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1211; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 1212; GFX8-NEXT: s_endpgm 1213; 1214; EG-LABEL: or_i1: 1215; EG: ; %bb.0: 1216; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 1217; EG-NEXT: TEX 1 @6 1218; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[] 1219; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1220; EG-NEXT: CF_END 1221; EG-NEXT: PAD 1222; EG-NEXT: Fetch clause starting at 6: 1223; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 1224; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1225; EG-NEXT: ALU clause starting at 10: 1226; EG-NEXT: MOV T0.X, KC0[2].Z, 1227; EG-NEXT: MOV * T1.X, KC0[2].W, 1228; EG-NEXT: 
ALU clause starting at 12: 1229; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X, 1230; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0, 1231; EG-NEXT: AND_INT T0.X, PV.W, 1, 1232; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1233; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1234 %a = load float, ptr addrspace(1) %in0 1235 %b = load float, ptr addrspace(1) %in1 1236 %acmp = fcmp oge float %a, 0.000000e+00 1237 %bcmp = fcmp oge float %b, 0.000000e+00 1238 %or = or i1 %acmp, %bcmp 1239 %result = zext i1 %or to i32 1240 store i32 %result, ptr addrspace(1) %out 1241 ret void 1242} 1243 1244define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { 1245; GFX6-LABEL: s_or_i1: 1246; GFX6: ; %bb.0: 1247; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 1248; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1249; GFX6-NEXT: s_mov_b32 s7, 0xf000 1250; GFX6-NEXT: s_mov_b32 s6, -1 1251; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1252; GFX6-NEXT: s_cmp_eq_u32 s0, s1 1253; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 1254; GFX6-NEXT: s_cmp_eq_u32 s2, s3 1255; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 1256; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1257; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] 1258; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 1259; GFX6-NEXT: s_endpgm 1260; 1261; GFX8-LABEL: s_or_i1: 1262; GFX8: ; %bb.0: 1263; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 1264; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 1265; GFX8-NEXT: s_mov_b32 s7, 0xf000 1266; GFX8-NEXT: s_mov_b32 s6, -1 1267; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1268; GFX8-NEXT: s_cmp_eq_u32 s0, s1 1269; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0 1270; GFX8-NEXT: s_cmp_eq_u32 s2, s3 1271; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 1272; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1273; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] 1274; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 1275; GFX8-NEXT: s_endpgm 1276; 1277; EG-LABEL: s_or_i1: 1278; EG: ; %bb.0: 1279; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] 1280; EG-NEXT: 
MEM_RAT MSKOR T0.XW, T1.X 1281; EG-NEXT: CF_END 1282; EG-NEXT: PAD 1283; EG-NEXT: ALU clause starting at 4: 1284; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y, 1285; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W, 1286; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, 1287; EG-NEXT: OR_INT * T0.W, PS, PV.W, 1288; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1289; EG-NEXT: AND_INT T0.W, PS, 1, 1290; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 1291; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1292; EG-NEXT: LSHL T0.X, PV.W, PS, 1293; EG-NEXT: LSHL * T0.W, literal.x, PS, 1294; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1295; EG-NEXT: MOV T0.Y, 0.0, 1296; EG-NEXT: MOV * T0.Z, 0.0, 1297; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1298; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1299 %cmp0 = icmp eq i32 %a, %b 1300 %cmp1 = icmp eq i32 %c, %d 1301 %or = or i1 %cmp0, %cmp1 1302 store i1 %or, ptr addrspace(1) %out 1303 ret void 1304} 1305 1306