; Codegen test for <4 x i8> / <8 x i8> shufflevector lowering on AMDGPU.
;
; Each function loads one or two byte vectors, applies a shufflevector mask
; (the digits in a @shuffleNNNN name spell out that mask), and stores the
; result. The RUN lines compile for gfx1010 (GFX10 prefix) and gfx908 (GFX9
; prefix); the assertions pin the instructions that are selected:
;   * @shuffle6766 .. @shuffle3546: two-operand shuffles through
;     addrspace(1) (global_load), addrspace(0) (flat_load), addrspace(3)
;     (ds_read) and addrspace(5) (buffer_load ... offen) pointers. Every one
;     expects a single v_perm_b32; the selector is an inline literal on GFX10
;     and is materialized with s_mov_b32 / s_movk_i32 on GFX9.
;   * @shuffle*ud2: the second shuffle operand is undef. These expect
;     v_perm_b32, except @shuffle5341ud2 which expects v_alignbit_b32.
;   * @insertUsesOr, @addUsesOr, @add, @add_div: the shuffle result feeds an
;     insertelement or a vector add. The expected output is SDWA or/add
;     sequences and contains no v_perm_b32.
;   * @shuffle8i8: amdgpu_kernel with an <8 x i8> shuffle. The expected output
;     is scalar s_bfe_u32 / s_lshl_b32 / s_and_b32 / s_or_b32 code.
;
; The CHECK lines are produced by utils/update_llc_test_checks.py (see the
; NOTE line below); regenerate them with that script instead of editing them
; by hand.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 3; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 4 5define hidden void @shuffle6766(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 6; GFX10-LABEL: shuffle6766: 7; GFX10: ; %bb.0: 8; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9; GFX10-NEXT: global_load_dword v0, v[2:3], off 10; GFX10-NEXT: s_waitcnt vmcnt(0) 11; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6060706 12; GFX10-NEXT: global_store_dword v[4:5], v0, off 13; GFX10-NEXT: s_setpc_b64 s[30:31] 14; 15; GFX9-LABEL: shuffle6766: 16; GFX9: ; %bb.0: 17; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 18; GFX9-NEXT: global_load_dword v0, v[2:3], off 19; GFX9-NEXT: s_mov_b32 s4, 0x6060706 20; GFX9-NEXT: s_waitcnt vmcnt(0) 21; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 22; GFX9-NEXT: global_store_dword v[4:5], v0, off 23; GFX9-NEXT: s_waitcnt vmcnt(0) 24; GFX9-NEXT: s_setpc_b64 s[30:31] 25 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 26 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 27 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 6, i32 6> 28 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 29 ret void 30} 31 32define hidden void @shuffle3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 33; GFX10-LABEL: shuffle3744: 34; GFX10: ; %bb.0: 35; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 36; GFX10-NEXT: global_load_dword v6, v[0:1], off 37; GFX10-NEXT: global_load_dword v7, v[2:3], off 38; GFX10-NEXT: s_waitcnt vmcnt(0) 39; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x307 40; GFX10-NEXT: global_store_dword v[4:5], v0, off 41; GFX10-NEXT: s_setpc_b64 s[30:31] 42; 43; GFX9-LABEL: shuffle3744: 44; GFX9: ; %bb.0: 45; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 46; 
GFX9-NEXT: global_load_dword v6, v[0:1], off 47; GFX9-NEXT: global_load_dword v7, v[2:3], off 48; GFX9-NEXT: s_movk_i32 s4, 0x307 49; GFX9-NEXT: s_waitcnt vmcnt(0) 50; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 51; GFX9-NEXT: global_store_dword v[4:5], v0, off 52; GFX9-NEXT: s_waitcnt vmcnt(0) 53; GFX9-NEXT: s_setpc_b64 s[30:31] 54 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 55 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 56 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 7, i32 4, i32 4> 57 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 58 ret void 59} 60 61define hidden void @shuffle4445(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 62; GFX10-LABEL: shuffle4445: 63; GFX10: ; %bb.0: 64; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 65; GFX10-NEXT: global_load_dword v0, v[2:3], off 66; GFX10-NEXT: s_waitcnt vmcnt(0) 67; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040404 68; GFX10-NEXT: global_store_dword v[4:5], v0, off 69; GFX10-NEXT: s_setpc_b64 s[30:31] 70; 71; GFX9-LABEL: shuffle4445: 72; GFX9: ; %bb.0: 73; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 74; GFX9-NEXT: global_load_dword v0, v[2:3], off 75; GFX9-NEXT: s_mov_b32 s4, 0x5040404 76; GFX9-NEXT: s_waitcnt vmcnt(0) 77; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 78; GFX9-NEXT: global_store_dword v[4:5], v0, off 79; GFX9-NEXT: s_waitcnt vmcnt(0) 80; GFX9-NEXT: s_setpc_b64 s[30:31] 81 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 82 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 83 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 4, i32 4, i32 5> 84 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 85 ret void 86} 87 88define hidden void @shuffle0101(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 89; GFX10-LABEL: shuffle0101: 90; GFX10: ; %bb.0: 91; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 92; GFX10-NEXT: global_load_dword v0, 
v[0:1], off 93; GFX10-NEXT: s_waitcnt vmcnt(0) 94; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 95; GFX10-NEXT: global_store_dword v[4:5], v0, off 96; GFX10-NEXT: s_setpc_b64 s[30:31] 97; 98; GFX9-LABEL: shuffle0101: 99; GFX9: ; %bb.0: 100; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 101; GFX9-NEXT: global_load_dword v0, v[0:1], off 102; GFX9-NEXT: s_mov_b32 s4, 0x5040504 103; GFX9-NEXT: s_waitcnt vmcnt(0) 104; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 105; GFX9-NEXT: global_store_dword v[4:5], v0, off 106; GFX9-NEXT: s_waitcnt vmcnt(0) 107; GFX9-NEXT: s_setpc_b64 s[30:31] 108 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 109 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 110 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 111 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 112 ret void 113} 114 115define hidden void @shuffle1004(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 116; GFX10-LABEL: shuffle1004: 117; GFX10: ; %bb.0: 118; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 119; GFX10-NEXT: global_load_dword v6, v[0:1], off 120; GFX10-NEXT: global_load_dword v7, v[2:3], off 121; GFX10-NEXT: s_waitcnt vmcnt(0) 122; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x40405 123; GFX10-NEXT: global_store_dword v[4:5], v0, off 124; GFX10-NEXT: s_setpc_b64 s[30:31] 125; 126; GFX9-LABEL: shuffle1004: 127; GFX9: ; %bb.0: 128; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 129; GFX9-NEXT: global_load_dword v6, v[0:1], off 130; GFX9-NEXT: global_load_dword v7, v[2:3], off 131; GFX9-NEXT: s_mov_b32 s4, 0x40405 132; GFX9-NEXT: s_waitcnt vmcnt(0) 133; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 134; GFX9-NEXT: global_store_dword v[4:5], v0, off 135; GFX9-NEXT: s_waitcnt vmcnt(0) 136; GFX9-NEXT: s_setpc_b64 s[30:31] 137 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 138 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 139 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x 
i8> %vec1, <4 x i32> <i32 1, i32 0, i32 0, i32 4> 140 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 141 ret void 142} 143 144 145 146define hidden void @shuffle7533(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) { 147; GFX10-LABEL: shuffle7533: 148; GFX10: ; %bb.0: 149; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 150; GFX10-NEXT: flat_load_dword v6, v[0:1] 151; GFX10-NEXT: flat_load_dword v7, v[2:3] 152; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 153; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3030507 154; GFX10-NEXT: flat_store_dword v[4:5], v0 155; GFX10-NEXT: s_waitcnt lgkmcnt(0) 156; GFX10-NEXT: s_setpc_b64 s[30:31] 157; 158; GFX9-LABEL: shuffle7533: 159; GFX9: ; %bb.0: 160; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 161; GFX9-NEXT: flat_load_dword v6, v[0:1] 162; GFX9-NEXT: flat_load_dword v7, v[2:3] 163; GFX9-NEXT: s_mov_b32 s4, 0x3030507 164; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 165; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 166; GFX9-NEXT: flat_store_dword v[4:5], v0 167; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 168; GFX9-NEXT: s_setpc_b64 s[30:31] 169 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4 170 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4 171 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 5, i32 3, i32 3> 172 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4 173 ret void 174} 175 176define hidden void @shuffle7767(ptr addrspace(0) %in0, ptr addrspace(0) %in1, ptr addrspace(0) %out0) { 177; GFX10-LABEL: shuffle7767: 178; GFX10: ; %bb.0: 179; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 180; GFX10-NEXT: flat_load_dword v0, v[2:3] 181; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 182; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060707 183; GFX10-NEXT: flat_store_dword v[4:5], v0 184; GFX10-NEXT: s_waitcnt lgkmcnt(0) 185; GFX10-NEXT: s_setpc_b64 s[30:31] 186; 187; GFX9-LABEL: shuffle7767: 188; GFX9: ; %bb.0: 189; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 190; GFX9-NEXT: flat_load_dword v0, v[2:3] 191; GFX9-NEXT: s_mov_b32 s4, 0x7060707 192; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 193; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 194; GFX9-NEXT: flat_store_dword v[4:5], v0 195; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 196; GFX9-NEXT: s_setpc_b64 s[30:31] 197 %vec0 = load <4 x i8>, ptr addrspace(0) %in0, align 4 198 %vec1 = load <4 x i8>, ptr addrspace(0) %in1, align 4 199 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 6, i32 7> 200 store <4 x i8> %shuffle0_0, ptr addrspace(0) %out0, align 4 201 ret void 202} 203 204define hidden void @shuffle0554(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) { 205; GFX10-LABEL: shuffle0554: 206; GFX10: ; %bb.0: 207; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 208; GFX10-NEXT: ds_read_b32 v0, v0 209; GFX10-NEXT: ds_read_b32 v1, v1 210; GFX10-NEXT: s_waitcnt lgkmcnt(0) 211; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104 212; GFX10-NEXT: ds_write_b32 v2, v0 213; GFX10-NEXT: s_waitcnt lgkmcnt(0) 214; GFX10-NEXT: s_setpc_b64 s[30:31] 215; 216; GFX9-LABEL: shuffle0554: 217; GFX9: ; %bb.0: 218; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 219; GFX9-NEXT: ds_read_b32 v0, v0 220; GFX9-NEXT: ds_read_b32 v1, v1 221; GFX9-NEXT: s_mov_b32 s4, 0x10104 222; GFX9-NEXT: s_waitcnt lgkmcnt(0) 223; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 224; GFX9-NEXT: ds_write_b32 v2, v0 225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 226; GFX9-NEXT: s_setpc_b64 s[30:31] 227 %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4 228 %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4 229 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 5, i32 5, i32 4> 230 store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4 231 ret void 232} 233 234define hidden void @shuffle2127(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %out0) { 235; GFX10-LABEL: shuffle2127: 236; GFX10: ; %bb.0: 237; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 238; GFX10-NEXT: ds_read_b32 v0, v0 239; GFX10-NEXT: ds_read_b32 v1, v1 240; GFX10-NEXT: s_waitcnt lgkmcnt(0) 241; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3060506 242; GFX10-NEXT: ds_write_b32 v2, v0 243; GFX10-NEXT: s_waitcnt lgkmcnt(0) 244; GFX10-NEXT: s_setpc_b64 s[30:31] 245; 246; GFX9-LABEL: shuffle2127: 247; GFX9: ; %bb.0: 248; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 249; GFX9-NEXT: ds_read_b32 v0, v0 250; GFX9-NEXT: ds_read_b32 v1, v1 251; GFX9-NEXT: s_mov_b32 s4, 0x3060506 252; GFX9-NEXT: s_waitcnt lgkmcnt(0) 253; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 254; GFX9-NEXT: ds_write_b32 v2, v0 255; GFX9-NEXT: s_waitcnt lgkmcnt(0) 256; GFX9-NEXT: s_setpc_b64 s[30:31] 257 %vec0 = load <4 x i8>, ptr addrspace(3) %in0, align 4 258 %vec1 = load <4 x i8>, ptr addrspace(3) %in1, align 4 259 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 1, i32 2, i32 7> 260 store <4 x i8> %shuffle0_0, ptr addrspace(3) %out0, align 4 261 ret void 262} 263 264define hidden void @shuffle5047(ptr addrspace(5) %in0, ptr addrspace(5) %in1, ptr addrspace(5) %out0) { 265; GFX10-LABEL: shuffle5047: 266; GFX10: ; %bb.0: 267; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 268; GFX10-NEXT: s_clause 0x1 269; GFX10-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen 270; GFX10-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen 271; GFX10-NEXT: s_waitcnt vmcnt(0) 272; GFX10-NEXT: v_perm_b32 v0, v4, v3, 0x7040005 273; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 274; GFX10-NEXT: s_setpc_b64 s[30:31] 275; 276; GFX9-LABEL: shuffle5047: 277; GFX9: ; %bb.0: 278; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 279; GFX9-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen 280; GFX9-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen 281; GFX9-NEXT: s_mov_b32 s4, 0x7040005 282; GFX9-NEXT: s_waitcnt vmcnt(0) 283; GFX9-NEXT: v_perm_b32 v0, v4, v3, s4 284; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen 285; GFX9-NEXT: 
s_waitcnt vmcnt(0) 286; GFX9-NEXT: s_setpc_b64 s[30:31] 287 %vec0 = load <4 x i8>, ptr addrspace(5) %in0, align 4 288 %vec1 = load <4 x i8>, ptr addrspace(5) %in1, align 4 289 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 0, i32 4, i32 7> 290 store <4 x i8> %shuffle0_0, ptr addrspace(5) %out0, align 4 291 ret void 292} 293 294define hidden void @shuffle3546(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 295; GFX10-LABEL: shuffle3546: 296; GFX10: ; %bb.0: 297; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 298; GFX10-NEXT: global_load_dword v6, v[0:1], off 299; GFX10-NEXT: global_load_dword v7, v[2:3], off 300; GFX10-NEXT: s_waitcnt vmcnt(0) 301; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x2000107 302; GFX10-NEXT: global_store_dword v[4:5], v0, off 303; GFX10-NEXT: s_setpc_b64 s[30:31] 304; 305; GFX9-LABEL: shuffle3546: 306; GFX9: ; %bb.0: 307; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 308; GFX9-NEXT: global_load_dword v6, v[0:1], off 309; GFX9-NEXT: global_load_dword v7, v[2:3], off 310; GFX9-NEXT: s_mov_b32 s4, 0x2000107 311; GFX9-NEXT: s_waitcnt vmcnt(0) 312; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 313; GFX9-NEXT: global_store_dword v[4:5], v0, off 314; GFX9-NEXT: s_waitcnt vmcnt(0) 315; GFX9-NEXT: s_setpc_b64 s[30:31] 316 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 317 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 318 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 5, i32 4, i32 6> 319 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 320 ret void 321} 322 323 324define hidden void @shuffle7330ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 325; GFX10-LABEL: shuffle7330ud2: 326; GFX10: ; %bb.0: 327; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 328; GFX10-NEXT: global_load_dword v0, v[0:1], off 329; GFX10-NEXT: s_waitcnt vmcnt(0) 330; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x4070706 331; GFX10-NEXT: global_store_dword v[2:3], v0, 
off 332; GFX10-NEXT: s_setpc_b64 s[30:31] 333; 334; GFX9-LABEL: shuffle7330ud2: 335; GFX9: ; %bb.0: 336; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 337; GFX9-NEXT: global_load_dword v0, v[0:1], off 338; GFX9-NEXT: s_mov_b32 s4, 0x4070706 339; GFX9-NEXT: s_waitcnt vmcnt(0) 340; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 341; GFX9-NEXT: global_store_dword v[2:3], v0, off 342; GFX9-NEXT: s_waitcnt vmcnt(0) 343; GFX9-NEXT: s_setpc_b64 s[30:31] 344 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 345 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 3, i32 3, i32 0> 346 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 347 ret void 348} 349 350define hidden void @shuffle5341ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 351; GFX10-LABEL: shuffle5341ud2: 352; GFX10: ; %bb.0: 353; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 354; GFX10-NEXT: global_load_dword v0, v[0:1], off 355; GFX10-NEXT: s_waitcnt vmcnt(0) 356; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 357; GFX10-NEXT: global_store_dword v[2:3], v0, off 358; GFX10-NEXT: s_setpc_b64 s[30:31] 359; 360; GFX9-LABEL: shuffle5341ud2: 361; GFX9: ; %bb.0: 362; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 363; GFX9-NEXT: global_load_dword v0, v[0:1], off 364; GFX9-NEXT: s_waitcnt vmcnt(0) 365; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 366; GFX9-NEXT: global_store_dword v[2:3], v0, off 367; GFX9-NEXT: s_waitcnt vmcnt(0) 368; GFX9-NEXT: s_setpc_b64 s[30:31] 369 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 370 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 5, i32 3, i32 4, i32 1> 371 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 372 ret void 373} 374 375define hidden void @shuffle6106ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 376; GFX10-LABEL: shuffle6106ud2: 377; GFX10: ; %bb.0: 378; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 379; GFX10-NEXT: global_load_dword v0, v[0:1], off 380; GFX10-NEXT: 
s_waitcnt vmcnt(0) 381; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 382; GFX10-NEXT: global_store_dword v[2:3], v0, off 383; GFX10-NEXT: s_setpc_b64 s[30:31] 384; 385; GFX9-LABEL: shuffle6106ud2: 386; GFX9: ; %bb.0: 387; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 388; GFX9-NEXT: global_load_dword v0, v[0:1], off 389; GFX9-NEXT: s_mov_b32 s4, 0x5040504 390; GFX9-NEXT: s_waitcnt vmcnt(0) 391; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 392; GFX9-NEXT: global_store_dword v[2:3], v0, off 393; GFX9-NEXT: s_waitcnt vmcnt(0) 394; GFX9-NEXT: s_setpc_b64 s[30:31] 395 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 396 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 6, i32 1, i32 0, i32 6> 397 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 398 ret void 399} 400 401 402define hidden void @shuffle4327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 403; GFX10-LABEL: shuffle4327ud2: 404; GFX10: ; %bb.0: 405; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 406; GFX10-NEXT: global_load_dword v0, v[0:1], off 407; GFX10-NEXT: s_waitcnt vmcnt(0) 408; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 409; GFX10-NEXT: global_store_dword v[2:3], v0, off 410; GFX10-NEXT: s_setpc_b64 s[30:31] 411; 412; GFX9-LABEL: shuffle4327ud2: 413; GFX9: ; %bb.0: 414; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 415; GFX9-NEXT: global_load_dword v0, v[0:1], off 416; GFX9-NEXT: s_mov_b32 s4, 0x7060706 417; GFX9-NEXT: s_waitcnt vmcnt(0) 418; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 419; GFX9-NEXT: global_store_dword v[2:3], v0, off 420; GFX9-NEXT: s_waitcnt vmcnt(0) 421; GFX9-NEXT: s_setpc_b64 s[30:31] 422 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 423 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 4, i32 3, i32 2, i32 7> 424 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 425 ret void 426} 427 428define hidden void @shuffle3263ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 429; GFX10-LABEL: 
shuffle3263ud2: 430; GFX10: ; %bb.0: 431; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 432; GFX10-NEXT: global_load_dword v0, v[0:1], off 433; GFX10-NEXT: s_waitcnt vmcnt(0) 434; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607 435; GFX10-NEXT: global_store_dword v[2:3], v0, off 436; GFX10-NEXT: s_setpc_b64 s[30:31] 437; 438; GFX9-LABEL: shuffle3263ud2: 439; GFX9: ; %bb.0: 440; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 441; GFX9-NEXT: global_load_dword v0, v[0:1], off 442; GFX9-NEXT: s_mov_b32 s4, 0x7060607 443; GFX9-NEXT: s_waitcnt vmcnt(0) 444; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 445; GFX9-NEXT: global_store_dword v[2:3], v0, off 446; GFX9-NEXT: s_waitcnt vmcnt(0) 447; GFX9-NEXT: s_setpc_b64 s[30:31] 448 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 449 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 6, i32 3> 450 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 451 ret void 452} 453 454define hidden void @shuffle2763ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 455; GFX10-LABEL: shuffle2763ud2: 456; GFX10: ; %bb.0: 457; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 458; GFX10-NEXT: global_load_dword v0, v[0:1], off 459; GFX10-NEXT: s_waitcnt vmcnt(0) 460; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060706 461; GFX10-NEXT: global_store_dword v[2:3], v0, off 462; GFX10-NEXT: s_setpc_b64 s[30:31] 463; 464; GFX9-LABEL: shuffle2763ud2: 465; GFX9: ; %bb.0: 466; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 467; GFX9-NEXT: global_load_dword v0, v[0:1], off 468; GFX9-NEXT: s_mov_b32 s4, 0x7060706 469; GFX9-NEXT: s_waitcnt vmcnt(0) 470; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 471; GFX9-NEXT: global_store_dword v[2:3], v0, off 472; GFX9-NEXT: s_waitcnt vmcnt(0) 473; GFX9-NEXT: s_setpc_b64 s[30:31] 474 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 475 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 2, i32 7, i32 6, i32 3> 476 store <4 x i8> %shuffle0_0, ptr 
addrspace(1) %out0, align 4 477 ret void 478} 479 480define hidden void @shuffle1327ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 481; GFX10-LABEL: shuffle1327ud2: 482; GFX10: ; %bb.0: 483; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 484; GFX10-NEXT: global_load_dword v0, v[0:1], off 485; GFX10-NEXT: s_waitcnt vmcnt(0) 486; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060705 487; GFX10-NEXT: global_store_dword v[2:3], v0, off 488; GFX10-NEXT: s_setpc_b64 s[30:31] 489; 490; GFX9-LABEL: shuffle1327ud2: 491; GFX9: ; %bb.0: 492; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 493; GFX9-NEXT: global_load_dword v0, v[0:1], off 494; GFX9-NEXT: s_mov_b32 s4, 0x7060705 495; GFX9-NEXT: s_waitcnt vmcnt(0) 496; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 497; GFX9-NEXT: global_store_dword v[2:3], v0, off 498; GFX9-NEXT: s_waitcnt vmcnt(0) 499; GFX9-NEXT: s_setpc_b64 s[30:31] 500 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 501 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 7> 502 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 503 ret void 504} 505 506define hidden void @shuffle0605ud2(ptr addrspace(1) %in0, ptr addrspace(1) %out0) { 507; GFX10-LABEL: shuffle0605ud2: 508; GFX10: ; %bb.0: 509; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 510; GFX10-NEXT: global_load_dword v0, v[0:1], off 511; GFX10-NEXT: s_waitcnt vmcnt(0) 512; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x5040504 513; GFX10-NEXT: global_store_dword v[2:3], v0, off 514; GFX10-NEXT: s_setpc_b64 s[30:31] 515; 516; GFX9-LABEL: shuffle0605ud2: 517; GFX9: ; %bb.0: 518; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 519; GFX9-NEXT: global_load_dword v0, v[0:1], off 520; GFX9-NEXT: s_mov_b32 s4, 0x5040504 521; GFX9-NEXT: s_waitcnt vmcnt(0) 522; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 523; GFX9-NEXT: global_store_dword v[2:3], v0, off 524; GFX9-NEXT: s_waitcnt vmcnt(0) 525; GFX9-NEXT: s_setpc_b64 s[30:31] 526 %vec0 = load <4 x i8>, ptr 
addrspace(1) %in0, align 4 527 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 0, i32 6, i32 0, i32 5> 528 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0, align 4 529 ret void 530} 531 532define hidden void @insertUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { 533; GFX10-LABEL: insertUsesOr: 534; GFX10: ; %bb.0: 535; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 536; GFX10-NEXT: global_load_dword v0, v[0:1], off 537; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 538; GFX10-NEXT: s_waitcnt vmcnt(0) 539; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 540; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 541; GFX10-NEXT: global_store_dword v[5:6], v0, off 542; GFX10-NEXT: s_setpc_b64 s[30:31] 543; 544; GFX9-LABEL: insertUsesOr: 545; GFX9: ; %bb.0: 546; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 547; GFX9-NEXT: global_load_dword v0, v[0:1], off 548; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v4 549; GFX9-NEXT: s_waitcnt vmcnt(0) 550; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 551; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 552; GFX9-NEXT: global_store_dword v[5:6], v0, off 553; GFX9-NEXT: s_waitcnt vmcnt(0) 554; GFX9-NEXT: s_setpc_b64 s[30:31] 555 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 556 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 557 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4> 558 %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 559 store <4 x i8> %vecins, ptr addrspace(1) %out0 560 ret void 561} 562 563define hidden void @addUsesOr(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { 564; GFX10-LABEL: addUsesOr: 565; GFX10: ; %bb.0: 566; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 567; GFX10-NEXT: global_load_dword v4, v[0:1], off 568; GFX10-NEXT: global_load_dword 
v7, v[2:3], off 569; GFX10-NEXT: s_waitcnt vmcnt(1) 570; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 571; GFX10-NEXT: s_waitcnt vmcnt(0) 572; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v7 573; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 574; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v7 575; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 576; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 577; GFX10-NEXT: v_add_nc_u16 v2, v2, v3 578; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 579; GFX10-NEXT: v_add_nc_u16 v1, v4, v1 580; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 581; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 582; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 583; GFX10-NEXT: global_store_dword v[5:6], v0, off 584; GFX10-NEXT: s_setpc_b64 s[30:31] 585; 586; GFX9-LABEL: addUsesOr: 587; GFX9: ; %bb.0: 588; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 589; GFX9-NEXT: global_load_dword v4, v[0:1], off 590; GFX9-NEXT: global_load_dword v7, v[2:3], off 591; GFX9-NEXT: s_waitcnt vmcnt(0) 592; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 593; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 594; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 595; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 596; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 597; GFX9-NEXT: global_store_dword v[5:6], v0, off 598; GFX9-NEXT: s_waitcnt vmcnt(0) 599; GFX9-NEXT: s_setpc_b64 s[30:31] 600 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 601 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 602 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 7, i32 0, i32 6, i32 3> 603 %added = add <4 x i8> %shuffle0_0, %vec1 604 store <4 x i8> %added, ptr addrspace(1) %out0 605 ret 
void 606} 607 608 609define amdgpu_kernel void @shuffle8i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out1) #0 { 610; GFX10-LABEL: shuffle8i8: 611; GFX10: ; %bb.0: ; %bb 612; GFX10-NEXT: s_clause 0x1 613; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 614; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 615; GFX10-NEXT: v_mov_b32_e32 v2, 0 616; GFX10-NEXT: s_waitcnt lgkmcnt(0) 617; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 618; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 619; GFX10-NEXT: s_waitcnt lgkmcnt(0) 620; GFX10-NEXT: s_bfe_u32 s2, s5, 0x80008 621; GFX10-NEXT: s_lshl_b32 s1, s9, 8 622; GFX10-NEXT: s_bfe_u32 s9, s9, 0x100010 623; GFX10-NEXT: s_bfe_u32 s0, s4, 0x80008 624; GFX10-NEXT: s_lshl_b32 s3, s8, 8 625; GFX10-NEXT: s_and_b32 s5, s8, 0xff00 626; GFX10-NEXT: s_bfe_u32 s8, s4, 0x80010 627; GFX10-NEXT: s_and_b32 s4, s4, 0xff 628; GFX10-NEXT: s_or_b32 s1, s2, s1 629; GFX10-NEXT: s_lshl_b32 s2, s9, 8 630; GFX10-NEXT: s_or_b32 s0, s0, s3 631; GFX10-NEXT: s_or_b32 s3, s8, s5 632; GFX10-NEXT: s_or_b32 s2, s4, s2 633; GFX10-NEXT: s_and_b32 s0, s0, 0xffff 634; GFX10-NEXT: s_lshl_b32 s1, s1, 16 635; GFX10-NEXT: s_and_b32 s2, s2, 0xffff 636; GFX10-NEXT: s_lshl_b32 s3, s3, 16 637; GFX10-NEXT: s_or_b32 s0, s0, s1 638; GFX10-NEXT: s_or_b32 s1, s2, s3 639; GFX10-NEXT: v_mov_b32_e32 v0, s0 640; GFX10-NEXT: v_mov_b32_e32 v1, s1 641; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 642; GFX10-NEXT: s_endpgm 643; 644; GFX9-LABEL: shuffle8i8: 645; GFX9: ; %bb.0: ; %bb 646; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 647; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 648; GFX9-NEXT: v_mov_b32_e32 v2, 0 649; GFX9-NEXT: s_waitcnt lgkmcnt(0) 650; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 651; GFX9-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 652; GFX9-NEXT: s_waitcnt lgkmcnt(0) 653; GFX9-NEXT: s_bfe_u32 s0, s4, 0x80008 654; GFX9-NEXT: s_lshl_b32 s1, s9, 8 655; GFX9-NEXT: s_bfe_u32 s2, s5, 0x80008 656; GFX9-NEXT: s_lshl_b32 s3, s8, 8 657; 
GFX9-NEXT: s_or_b32 s1, s2, s1 658; GFX9-NEXT: s_or_b32 s0, s0, s3 659; GFX9-NEXT: s_bfe_u32 s2, s4, 0x80010 660; GFX9-NEXT: s_and_b32 s3, s4, 0xff 661; GFX9-NEXT: s_bfe_u32 s4, s9, 0x100010 662; GFX9-NEXT: s_and_b32 s5, s8, 0xff00 663; GFX9-NEXT: s_lshl_b32 s4, s4, 8 664; GFX9-NEXT: s_or_b32 s2, s2, s5 665; GFX9-NEXT: s_or_b32 s3, s3, s4 666; GFX9-NEXT: s_and_b32 s3, s3, 0xffff 667; GFX9-NEXT: s_lshl_b32 s2, s2, 16 668; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 669; GFX9-NEXT: s_lshl_b32 s1, s1, 16 670; GFX9-NEXT: s_or_b32 s2, s3, s2 671; GFX9-NEXT: s_or_b32 s0, s0, s1 672; GFX9-NEXT: v_mov_b32_e32 v0, s0 673; GFX9-NEXT: v_mov_b32_e32 v1, s2 674; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 675; GFX9-NEXT: s_endpgm 676bb: 677 %vec0 = load <8 x i8>, ptr addrspace(1) %in0 678 %vec1 = load <8 x i8>, ptr addrspace(1) %in1 679 %shuffle0 = shufflevector <8 x i8> %vec0, <8 x i8> %vec1, <8 x i32> <i32 1, i32 8, i32 5, i32 12, i32 0, i32 14, i32 2, i32 9> 680 store <8 x i8> %shuffle0, ptr addrspace(1) %out1 681 ret void 682} 683 684declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 685declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone 686 687; Not combined to perm due to non-vectorized use, non-divergent 688define hidden void @add(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { 689; GFX10-LABEL: add: 690; GFX10: ; %bb.0: 691; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 692; GFX10-NEXT: global_load_dword v4, v[0:1], off 693; GFX10-NEXT: global_load_dword v7, v[2:3], off 694; GFX10-NEXT: s_waitcnt vmcnt(1) 695; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 696; GFX10-NEXT: s_waitcnt vmcnt(0) 697; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 698; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v7 699; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4 700; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v7 701; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 702; GFX10-NEXT: v_add_nc_u16 v2, v7, v2 703; GFX10-NEXT: v_add_nc_u16 v3, v3, v7 704; GFX10-NEXT: v_add_nc_u16 v1, v1, v4 
705; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 706; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 707; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 708; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 709; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 710; GFX10-NEXT: global_store_dword v[5:6], v0, off 711; GFX10-NEXT: s_setpc_b64 s[30:31] 712; 713; GFX9-LABEL: add: 714; GFX9: ; %bb.0: 715; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 716; GFX9-NEXT: global_load_dword v4, v[0:1], off 717; GFX9-NEXT: global_load_dword v7, v[2:3], off 718; GFX9-NEXT: s_waitcnt vmcnt(0) 719; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 720; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 721; GFX9-NEXT: v_add_u16_sdwa v2, v7, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 722; GFX9-NEXT: v_add_u16_sdwa v3, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 723; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 724; GFX9-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 725; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 726; GFX9-NEXT: global_store_dword v[5:6], v0, off 727; GFX9-NEXT: s_waitcnt vmcnt(0) 728; GFX9-NEXT: s_setpc_b64 s[30:31] 729 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 730 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 731 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4> 732 %vecins = add <4 x i8> %shuffle0_0, %vec1 733 store <4 x i8> %vecins, ptr addrspace(1) %out0 734 ret void 735} 736 737; Not combined to perm due to 
non-vectorized use 738define hidden void @add_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0) { 739; GFX10-LABEL: add_div: 740; GFX10: ; %bb.0: 741; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 742; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 743; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 744; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 745; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 746; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 747; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 748; GFX10-NEXT: global_load_dword v4, v[0:1], off 749; GFX10-NEXT: global_load_dword v7, v[2:3], off 750; GFX10-NEXT: s_waitcnt vmcnt(1) 751; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 752; GFX10-NEXT: s_waitcnt vmcnt(0) 753; GFX10-NEXT: v_lshrrev_b16 v1, 8, v7 754; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 755; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4 756; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 757; GFX10-NEXT: v_add_nc_u16 v1, v1, v7 758; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 759; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 760; GFX10-NEXT: global_store_dword v[5:6], v0, off 761; GFX10-NEXT: s_setpc_b64 s[30:31] 762; 763; GFX9-LABEL: add_div: 764; GFX9: ; %bb.0: 765; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 766; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 767; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 768; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 769; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 770; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 771; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 772; GFX9-NEXT: global_load_dword v4, v[0:1], off 773; GFX9-NEXT: global_load_dword v7, v[2:3], off 774; GFX9-NEXT: s_waitcnt vmcnt(0) 775; GFX9-NEXT: v_add_u16_sdwa v0, v4, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 776; GFX9-NEXT: v_add_u16_sdwa v1, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 777; GFX9-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 778; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 779; GFX9-NEXT: global_store_dword v[5:6], v0, off 780; GFX9-NEXT: s_waitcnt vmcnt(0) 781; GFX9-NEXT: s_setpc_b64 s[30:31] 782 %tid = call i32 @llvm.amdgcn.workitem.id.x() 783 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 784 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 785 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 786 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 787 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4> 788 %vecins = add <4 x i8> %shuffle0_0, %vec1 789 store <4 x i8> %vecins, ptr addrspace(1) %out0 790 ret void 791} 792 793; Not combined to perm due to non-divergent use 794define hidden void @add_store(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 795; GFX10-LABEL: add_store: 796; GFX10: ; %bb.0: 797; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 798; GFX10-NEXT: global_load_dword v4, v[0:1], off 799; GFX10-NEXT: global_load_dword v9, v[2:3], off 800; GFX10-NEXT: s_waitcnt vmcnt(1) 801; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 802; GFX10-NEXT: s_waitcnt vmcnt(0) 803; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 804; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 805; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 806; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00 807; GFX10-NEXT: v_add_nc_u16 v3, v2, v9 808; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 809; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 810; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 811; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 812; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 813; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 814; GFX10-NEXT: global_store_dword v[5:6], v0, off 815; GFX10-NEXT: global_store_dword v[7:8], 
v1, off 816; GFX10-NEXT: s_setpc_b64 s[30:31] 817; 818; GFX9-LABEL: add_store: 819; GFX9: ; %bb.0: 820; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 821; GFX9-NEXT: global_load_dword v4, v[0:1], off 822; GFX9-NEXT: global_load_dword v9, v[2:3], off 823; GFX9-NEXT: s_movk_i32 s4, 0xff00 824; GFX9-NEXT: s_waitcnt vmcnt(1) 825; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 826; GFX9-NEXT: s_waitcnt vmcnt(0) 827; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 828; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 829; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 830; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 831; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 832; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 833; GFX9-NEXT: global_store_dword v[5:6], v1, off 834; GFX9-NEXT: global_store_dword v[7:8], v0, off 835; GFX9-NEXT: s_waitcnt vmcnt(0) 836; GFX9-NEXT: s_setpc_b64 s[30:31] 837 %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 838 %vec1 = load <4 x i8>, ptr addrspace(1) %in1, align 4 839 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4> 840 %vecins = add <4 x i8> %shuffle0_0, %vec1 841 store <4 x i8> %vecins, ptr addrspace(1) %out0 842 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 843 ret void 844} 845 846; Not combined to perm due to 16 bit or 847define hidden void @add_store_div_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 848; GFX10-LABEL: add_store_div_16: 849; GFX10: ; %bb.0: 850; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 851; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 852; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 853; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 
v0, v4 854; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 855; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 856; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 857; GFX10-NEXT: global_load_dword v4, v[0:1], off 858; GFX10-NEXT: global_load_dword v9, v[2:3], off 859; GFX10-NEXT: s_waitcnt vmcnt(1) 860; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 861; GFX10-NEXT: s_waitcnt vmcnt(0) 862; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 863; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 864; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 865; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00 866; GFX10-NEXT: v_add_nc_u16 v3, v2, v9 867; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 868; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 869; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 870; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 871; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 872; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 873; GFX10-NEXT: global_store_dword v[5:6], v0, off 874; GFX10-NEXT: global_store_dword v[7:8], v1, off 875; GFX10-NEXT: s_setpc_b64 s[30:31] 876; 877; GFX9-LABEL: add_store_div_16: 878; GFX9: ; %bb.0: 879; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 880; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 881; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 882; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 883; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 884; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 885; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 886; GFX9-NEXT: global_load_dword v4, v[0:1], off 887; GFX9-NEXT: global_load_dword v9, v[2:3], off 888; GFX9-NEXT: s_movk_i32 s4, 0xff00 889; GFX9-NEXT: s_waitcnt vmcnt(1) 890; GFX9-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 891; GFX9-NEXT: s_waitcnt vmcnt(0) 892; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 893; 
GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 894; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 895; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 896; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 897; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 898; GFX9-NEXT: global_store_dword v[5:6], v1, off 899; GFX9-NEXT: global_store_dword v[7:8], v0, off 900; GFX9-NEXT: s_waitcnt vmcnt(0) 901; GFX9-NEXT: s_setpc_b64 s[30:31] 902 %tid = call i32 @llvm.amdgcn.workitem.id.x() 903 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 904 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 905 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 906 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 907 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 4> 908 %vecins = add <4 x i8> %shuffle0_0, %vec1 909 store <4 x i8> %vecins, ptr addrspace(1) %out0 910 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 911 ret void 912} 913 914; Vectorized use, divergent, 32 bit or 915define hidden void @add_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 916; GFX10-LABEL: add_store_div: 917; GFX10: ; %bb.0: 918; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 919; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 920; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 921; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 922; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 923; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 924; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 925; GFX10-NEXT: global_load_dword v4, v[0:1], off 926; GFX10-NEXT: global_load_dword v9, v[2:3], off 927; GFX10-NEXT: s_waitcnt vmcnt(1) 928; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 929; GFX10-NEXT: s_waitcnt vmcnt(0) 930; 
GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 931; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v9 932; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4 933; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v9 934; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 935; GFX10-NEXT: v_add_nc_u16 v2, v9, v2 936; GFX10-NEXT: v_add_nc_u16 v3, v3, v9 937; GFX10-NEXT: v_add_nc_u16 v1, v1, v10 938; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 939; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 940; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 941; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 942; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 943; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x10705 944; GFX10-NEXT: global_store_dword v[5:6], v0, off 945; GFX10-NEXT: global_store_dword v[7:8], v1, off 946; GFX10-NEXT: s_setpc_b64 s[30:31] 947; 948; GFX9-LABEL: add_store_div: 949; GFX9: ; %bb.0: 950; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 951; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 952; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 953; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 954; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 955; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 956; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 957; GFX9-NEXT: global_load_dword v4, v[0:1], off 958; GFX9-NEXT: global_load_dword v9, v[2:3], off 959; GFX9-NEXT: s_mov_b32 s4, 0x10705 960; GFX9-NEXT: s_waitcnt vmcnt(0) 961; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 962; GFX9-NEXT: v_add_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 963; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 964; GFX9-NEXT: v_add_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 965; GFX9-NEXT: v_add_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:WORD_1 
966; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 967; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 968; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 969; GFX9-NEXT: global_store_dword v[5:6], v1, off 970; GFX9-NEXT: global_store_dword v[7:8], v0, off 971; GFX9-NEXT: s_waitcnt vmcnt(0) 972; GFX9-NEXT: s_setpc_b64 s[30:31] 973 %tid = call i32 @llvm.amdgcn.workitem.id.x() 974 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 975 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 976 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 977 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 978 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 5, i32 4> 979 %vecins = add <4 x i8> %shuffle0_0, %vec1 980 store <4 x i8> %vecins, ptr addrspace(1) %out0 981 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 982 ret void 983} 984; Loads are addressed by the workitem id; the shuffle is stored as a vector and also feeds an and with a constant vector. The checks expect one v_perm_b32 for the stored shuffle. 985define hidden void @and_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 986; GFX10-LABEL: and_store_div: 987; GFX10: ; %bb.0: 988; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 989; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 990; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 991; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 992; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 993; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 994; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 995; GFX10-NEXT: global_load_dword v4, v[2:3], off 996; GFX10-NEXT: global_load_dword v9, v[0:1], off 997; GFX10-NEXT: v_mov_b32_e32 v0, 2 998; GFX10-NEXT: v_mov_b32_e32 v1, 1 999; GFX10-NEXT: s_waitcnt vmcnt(1) 1000; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1001; GFX10-NEXT: s_waitcnt
vmcnt(0) 1002; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1003; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9 1004; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 1005; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 1006; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1007; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1008; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006 1009; GFX10-NEXT: global_store_dword v[5:6], v0, off 1010; GFX10-NEXT: global_store_dword v[7:8], v1, off 1011; GFX10-NEXT: s_setpc_b64 s[30:31] 1012; 1013; GFX9-LABEL: and_store_div: 1014; GFX9: ; %bb.0: 1015; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1016; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1017; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1018; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1019; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1020; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1021; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1022; GFX9-NEXT: global_load_dword v4, v[0:1], off 1023; GFX9-NEXT: global_load_dword v9, v[2:3], off 1024; GFX9-NEXT: s_mov_b32 s4, 0x5070006 1025; GFX9-NEXT: v_mov_b32_e32 v0, 2 1026; GFX9-NEXT: v_mov_b32_e32 v1, 1 1027; GFX9-NEXT: s_waitcnt vmcnt(1) 1028; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1029; GFX9-NEXT: s_waitcnt vmcnt(0) 1030; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 1031; GFX9-NEXT: v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1032; GFX9-NEXT: v_and_b32_e32 v9, 0x100, v4 1033; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 1034; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 1035; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD 1036; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1037; GFX9-NEXT: global_store_dword v[5:6], v0, off 1038; GFX9-NEXT: global_store_dword v[7:8], v2, off 1039; GFX9-NEXT: s_waitcnt vmcnt(0) 1040; GFX9-NEXT: s_setpc_b64 s[30:31] 1041 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1042 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1043 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1044 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1045 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1046 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 4, i32 3, i32 1> 1047 %vecins = and <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> 1048 store <4 x i8> %vecins, ptr addrspace(1) %out0 1049 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1050 ret void 1051} 1052; Loads are addressed by the workitem id; the shuffle is stored as a vector and also feeds an ashr by a constant vector. The checks expect one v_perm_b32 for the stored shuffle. 1053define hidden void @ashr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1054; GFX10-LABEL: ashr_store_div: 1055; GFX10: ; %bb.0: 1056; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1057; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1058; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1059; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1060; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1061; GFX10-NEXT: global_load_dword v9, v[0:1], off 1062; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4 1063; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo 1064; GFX10-NEXT: v_mov_b32_e32 v2, 26 1065; GFX10-NEXT: global_load_dword v0, v[0:1], off 1066; GFX10-NEXT: s_waitcnt vmcnt(1) 1067; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8 1068; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1069; GFX10-NEXT: v_ashrrev_i32_e32 v3, 25, v9 1070; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1 1071; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD
src0_sel:BYTE_0 src1_sel:DWORD 1072; GFX10-NEXT: s_waitcnt vmcnt(0) 1073; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0 1074; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707 1075; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 1076; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1077; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1078; GFX10-NEXT: global_store_dword v[5:6], v1, off 1079; GFX10-NEXT: global_store_dword v[7:8], v0, off 1080; GFX10-NEXT: s_setpc_b64 s[30:31] 1081; 1082; GFX9-LABEL: ashr_store_div: 1083; GFX9: ; %bb.0: 1084; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1085; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1086; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1087; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1088; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1089; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1090; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1091; GFX9-NEXT: global_load_dword v4, v[0:1], off 1092; GFX9-NEXT: global_load_dword v9, v[2:3], off 1093; GFX9-NEXT: v_mov_b32_e32 v1, 7 1094; GFX9-NEXT: s_mov_b32 s4, 0x4010707 1095; GFX9-NEXT: v_mov_b32_e32 v0, 26 1096; GFX9-NEXT: s_waitcnt vmcnt(1) 1097; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 1098; GFX9-NEXT: s_waitcnt vmcnt(0) 1099; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 1100; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1101; GFX9-NEXT: v_ashrrev_i32_e32 v3, 25, v4 1102; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9 1103; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 1104; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1105; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1106; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1107; GFX9-NEXT: global_store_dword v[5:6], v0, off 1108; GFX9-NEXT: global_store_dword v[7:8], v2, off 1109; GFX9-NEXT: s_waitcnt vmcnt(0) 1110; GFX9-NEXT: s_setpc_b64 s[30:31] 1111 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1112 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1113 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1114 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1115 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1116 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 0> 1117 %vecins = ashr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> 1118 store <4 x i8> %vecins, ptr addrspace(1) %out0 1119 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1120 ret void 1121} 1122; Loads are addressed by the workitem id; the shuffle is stored once after a bitcast to i32 and once as <4 x i8>. The checks expect a single v_perm_b32 whose result feeds both stores. 1123define hidden void @bc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1124; GFX10-LABEL: bc_store_div: 1125; GFX10: ; %bb.0: 1126; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1127; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1128; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1129; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1130; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1131; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1132; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1133; GFX10-NEXT: global_load_dword v4, v[0:1], off 1134; GFX10-NEXT: global_load_dword v9, v[2:3], off 1135; GFX10-NEXT: s_waitcnt vmcnt(0) 1136; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x7060104 1137; GFX10-NEXT: global_store_dword v[7:8], v0, off 1138; GFX10-NEXT: global_store_dword v[5:6], v0, off 1139; GFX10-NEXT: s_setpc_b64 s[30:31] 1140; 1141; GFX9-LABEL: bc_store_div: 1142; GFX9: ; %bb.0: 1143; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1144; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1145; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1146; GFX9-NEXT: v_add_co_u32_e32
v0, vcc, v0, v4 1147; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1148; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1149; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1150; GFX9-NEXT: global_load_dword v4, v[0:1], off 1151; GFX9-NEXT: global_load_dword v9, v[2:3], off 1152; GFX9-NEXT: s_mov_b32 s4, 0x7060104 1153; GFX9-NEXT: s_waitcnt vmcnt(0) 1154; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 1155; GFX9-NEXT: global_store_dword v[7:8], v0, off 1156; GFX9-NEXT: global_store_dword v[5:6], v0, off 1157; GFX9-NEXT: s_waitcnt vmcnt(0) 1158; GFX9-NEXT: s_setpc_b64 s[30:31] 1159 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1160 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1161 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1162 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1163 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1164 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1165 %insvec = bitcast <4 x i8> %shuffle0_0 to i32 1166 store i32 %insvec, ptr addrspace(1) %out1 1167 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 1168 ret void 1169} 1170 1171; Loads are addressed by the workitem id; element 1 of the shuffle is extracted and stored as a byte to %out2, and the whole shuffle is stored as a vector to %out1 (%out0 is unused). The checks expect one v_perm_b32 for the vector store. 1172define hidden void @eve_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { 1173; GFX10-LABEL: eve_store_div: 1174; GFX10: ; %bb.0: 1175; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1176; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1177; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1178; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1179; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1180; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1181; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1182; GFX10-NEXT: global_load_dword v4, v[0:1], off 1183; GFX10-NEXT: global_load_dword v5, v[2:3], off 1184; GFX10-NEXT: s_waitcnt vmcnt(1) 1185; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 1186; GFX10-NEXT: s_waitcnt vmcnt(0)
1187; GFX10-NEXT: v_perm_b32 v1, v5, v4, 0x1020305 1188; GFX10-NEXT: global_store_byte v[9:10], v0, off 1189; GFX10-NEXT: global_store_dword v[7:8], v1, off 1190; GFX10-NEXT: s_setpc_b64 s[30:31] 1191; 1192; GFX9-LABEL: eve_store_div: 1193; GFX9: ; %bb.0: 1194; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1195; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1196; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1197; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1198; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1199; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1200; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1201; GFX9-NEXT: global_load_dword v4, v[0:1], off 1202; GFX9-NEXT: global_load_dword v5, v[2:3], off 1203; GFX9-NEXT: s_mov_b32 s4, 0x1020305 1204; GFX9-NEXT: s_waitcnt vmcnt(1) 1205; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4 1206; GFX9-NEXT: s_waitcnt vmcnt(0) 1207; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 1208; GFX9-NEXT: global_store_byte v[9:10], v1, off 1209; GFX9-NEXT: global_store_dword v[7:8], v0, off 1210; GFX9-NEXT: s_waitcnt vmcnt(0) 1211; GFX9-NEXT: s_setpc_b64 s[30:31] 1212 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1213 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1214 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1215 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1216 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1217 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 3, i32 2, i32 1> 1218 %tmp = extractelement <4 x i8> %shuffle0_0, i32 1 1219 store i8 %tmp, ptr addrspace(1) %out2 1220 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1221 ret void 1222} 1223 1224; Not combined to perm due to multi use of or operands (introduced by insert op) 1225define hidden void @ive_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1226; GFX10-LABEL: ive_store_div: 1227; GFX10: ; %bb.0: 1228; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) 1229; GFX10-NEXT: v_and_b32_e32 v9, 0x3ff, v31 1230; GFX10-NEXT: v_lshlrev_b32_e32 v9, 2, v9 1231; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v9 1232; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1233; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v9 1234; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1235; GFX10-NEXT: global_load_dword v9, v[0:1], off 1236; GFX10-NEXT: global_load_dword v10, v[2:3], off 1237; GFX10-NEXT: v_mov_b32_e32 v0, 16 1238; GFX10-NEXT: v_mov_b32_e32 v1, 0xff 1239; GFX10-NEXT: v_lshlrev_b16 v2, 8, v4 1240; GFX10-NEXT: s_waitcnt vmcnt(1) 1241; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1242; GFX10-NEXT: s_waitcnt vmcnt(0) 1243; GFX10-NEXT: v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1244; GFX10-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1245; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 1246; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1247; GFX10-NEXT: v_perm_b32 v1, v10, v9, 0x2000706 1248; GFX10-NEXT: global_store_dword v[5:6], v0, off 1249; GFX10-NEXT: global_store_dword v[7:8], v1, off 1250; GFX10-NEXT: s_setpc_b64 s[30:31] 1251; 1252; GFX9-LABEL: ive_store_div: 1253; GFX9: ; %bb.0: 1254; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1255; GFX9-NEXT: v_and_b32_e32 v9, 0x3ff, v31 1256; GFX9-NEXT: v_lshlrev_b32_e32 v9, 2, v9 1257; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v9 1258; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1259; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v9 1260; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1261; GFX9-NEXT: global_load_dword v9, v[0:1], off 1262; GFX9-NEXT: global_load_dword v10, v[2:3], off 1263; GFX9-NEXT: s_movk_i32 s4, 0xff 1264; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 1265; GFX9-NEXT: s_mov_b32 s5, 0x2000706 1266; GFX9-NEXT: 
s_waitcnt vmcnt(1) 1267; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v9 1268; GFX9-NEXT: s_waitcnt vmcnt(0) 1269; GFX9-NEXT: v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1270; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1271; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 1272; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1273; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1274; GFX9-NEXT: v_perm_b32 v3, v10, v9, s5 1275; GFX9-NEXT: global_store_dword v[5:6], v0, off 1276; GFX9-NEXT: global_store_dword v[7:8], v3, off 1277; GFX9-NEXT: s_waitcnt vmcnt(0) 1278; GFX9-NEXT: s_setpc_b64 s[30:31] 1279 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1280 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1281 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1282 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1283 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1284 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 7, i32 0, i32 2> 1285 %vecins = insertelement <4 x i8> %shuffle0_0, i8 %elt, i32 1 1286 store <4 x i8> %vecins, ptr addrspace(1) %out0 1287 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1288 ret void 1289} 1290 1291; Loads are addressed by the workitem id; the shuffle is stored as a vector and also feeds an lshr by a constant vector. The checks expect one v_perm_b32 for the stored shuffle. NOTE(review): the function name looks like a transposition of lshr; it is kept because the check labels use it. 1292define hidden void @lhsr_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1293; GFX10-LABEL: lhsr_store_div: 1294; GFX10: ; %bb.0: 1295; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1296; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1297; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1298; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1299; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1300; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1301; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1302; GFX10-NEXT: global_load_dword v4, v[0:1], off 1303;
GFX10-NEXT: global_load_dword v9, v[2:3], off 1304; GFX10-NEXT: v_mov_b32_e32 v0, 26 1305; GFX10-NEXT: s_waitcnt vmcnt(1) 1306; GFX10-NEXT: v_lshrrev_b16 v1, 1, v4 1307; GFX10-NEXT: s_waitcnt vmcnt(0) 1308; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1309; GFX10-NEXT: v_lshrrev_b32_e32 v2, 25, v9 1310; GFX10-NEXT: v_lshrrev_b32_e32 v3, 26, v4 1311; GFX10-NEXT: v_and_b32_e32 v1, 0x7f00, v1 1312; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 1313; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1314; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1315; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x1030707 1316; GFX10-NEXT: global_store_dword v[5:6], v0, off 1317; GFX10-NEXT: global_store_dword v[7:8], v1, off 1318; GFX10-NEXT: s_setpc_b64 s[30:31] 1319; 1320; GFX9-LABEL: lhsr_store_div: 1321; GFX9: ; %bb.0: 1322; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1323; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1324; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1325; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1326; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1327; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1328; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1329; GFX9-NEXT: global_load_dword v4, v[0:1], off 1330; GFX9-NEXT: global_load_dword v9, v[2:3], off 1331; GFX9-NEXT: v_mov_b32_e32 v0, 26 1332; GFX9-NEXT: s_mov_b32 s4, 0x1030707 1333; GFX9-NEXT: s_waitcnt vmcnt(1) 1334; GFX9-NEXT: v_lshrrev_b16_e32 v3, 1, v4 1335; GFX9-NEXT: s_waitcnt vmcnt(0) 1336; GFX9-NEXT: v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1337; GFX9-NEXT: v_lshrrev_b32_e32 v2, 25, v9 1338; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 1339; GFX9-NEXT: v_lshrrev_b32_e32 v4, 26, v4 1340; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 1341; GFX9-NEXT: v_and_b32_e32 v2, 0x7f00, v3 1342; GFX9-NEXT: v_or_b32_sdwa 
v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1343; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1344; GFX9-NEXT: global_store_dword v[5:6], v0, off 1345; GFX9-NEXT: global_store_dword v[7:8], v1, off 1346; GFX9-NEXT: s_waitcnt vmcnt(0) 1347; GFX9-NEXT: s_setpc_b64 s[30:31] 1348 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1349 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1350 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1351 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1352 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1353 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 7, i32 3, i32 1> 1354 %vecins = lshr <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> 1355 store <4 x i8> %vecins, ptr addrspace(1) %out0 1356 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1357 ret void 1358} 1359 1360; Loads are addressed by the workitem id; the shuffle is stored as a vector and also feeds a mul with %vec1. The checks expect one v_perm_b32 for the stored shuffle. 1361define hidden void @mul_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1362; GFX10-LABEL: mul_store_div: 1363; GFX10: ; %bb.0: 1364; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1365; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1366; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1367; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1368; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1369; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1370; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1371; GFX10-NEXT: global_load_dword v4, v[0:1], off 1372; GFX10-NEXT: global_load_dword v9, v[2:3], off 1373; GFX10-NEXT: s_waitcnt vmcnt(1) 1374; GFX10-NEXT: v_lshrrev_b16 v0, 8, v4 1375; GFX10-NEXT: s_waitcnt vmcnt(0) 1376; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v9 1377; GFX10-NEXT: v_lshrrev_b16 v2, 8, v9 1378; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v9 1379; GFX10-NEXT: v_mul_lo_u16 v0, v0, v2 1380; GFX10-NEXT: v_mul_lo_u16 v1, v3, v1
1381; GFX10-NEXT: v_mul_lo_u16 v2, v4, v9 1382; GFX10-NEXT: v_mul_lo_u16 v3, v9, v3 1383; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 1384; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 1385; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1386; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1387; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1388; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2000504 1389; GFX10-NEXT: global_store_dword v[5:6], v0, off 1390; GFX10-NEXT: global_store_dword v[7:8], v1, off 1391; GFX10-NEXT: s_setpc_b64 s[30:31] 1392; 1393; GFX9-LABEL: mul_store_div: 1394; GFX9: ; %bb.0: 1395; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1396; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1397; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1398; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1399; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1400; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1401; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1402; GFX9-NEXT: global_load_dword v4, v[0:1], off 1403; GFX9-NEXT: global_load_dword v9, v[2:3], off 1404; GFX9-NEXT: s_mov_b32 s4, 0x2000504 1405; GFX9-NEXT: s_waitcnt vmcnt(0) 1406; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 1407; GFX9-NEXT: v_mul_lo_u16_e32 v1, v4, v9 1408; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1409; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 1410; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1411; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1412; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1413; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1414; GFX9-NEXT: global_store_dword v[5:6], v1, off 1415; GFX9-NEXT: global_store_dword v[7:8], v0, off 1416; GFX9-NEXT: s_waitcnt vmcnt(0) 1417; GFX9-NEXT: s_setpc_b64 s[30:31] 1418 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1419 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1420 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1421 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1422 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1423 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 4, i32 6> 1424 %vecins = mul <4 x i8> %shuffle0_0, %vec1 1425 store <4 x i8> %vecins, ptr addrspace(1) %out0 1426 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1427 ret void 1428} 1429 1430 1431define hidden void @or_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1432; GFX10-LABEL: or_store_div: 1433; GFX10: ; %bb.0: 1434; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1435; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1436; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1437; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1438; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1439; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1440; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1441; GFX10-NEXT: global_load_dword v4, v[2:3], off 1442; GFX10-NEXT: global_load_dword v9, v[0:1], off 1443; GFX10-NEXT: v_mov_b32_e32 v0, 16 1444; GFX10-NEXT: v_bfrev_b32_e32 v2, 4.0 1445; GFX10-NEXT: s_waitcnt vmcnt(1) 1446; GFX10-NEXT: v_lshlrev_b16 v1, 8, v4 1447; GFX10-NEXT: v_lshrrev_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1448; GFX10-NEXT: s_waitcnt vmcnt(0) 1449; GFX10-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 1450; GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 1451; GFX10-NEXT: v_or_b32_e32 v1, 0x201, v1 1452; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1453; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1454; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2010005 1455; GFX10-NEXT: global_store_dword v[5:6], v0, off 1456; GFX10-NEXT: global_store_dword v[7:8], v1, off 1457; GFX10-NEXT: s_setpc_b64 s[30:31] 1458; 1459; GFX9-LABEL: or_store_div: 1460; GFX9: ; %bb.0: 1461; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1462; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1463; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1464; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1465; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1466; GFX9-NEXT: global_load_dword v2, v[2:3], off 1467; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1468; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1469; GFX9-NEXT: global_load_dword v0, v[0:1], off 1470; GFX9-NEXT: s_mov_b32 s4, 0x2010005 1471; GFX9-NEXT: s_movk_i32 s5, 0x102 1472; GFX9-NEXT: s_waitcnt vmcnt(1) 1473; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 1474; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2 1475; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1476; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 1477; GFX9-NEXT: v_or_b32_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1478; GFX9-NEXT: s_waitcnt vmcnt(0) 1479; GFX9-NEXT: v_perm_b32 v4, v0, v2, s4 1480; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 1481; GFX9-NEXT: v_or_b32_e32 v0, 0x201, v0 1482; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1483; GFX9-NEXT: global_store_dword v[5:6], v0, off 1484; GFX9-NEXT: global_store_dword v[7:8], v4, off 1485; GFX9-NEXT: s_waitcnt vmcnt(0) 1486; GFX9-NEXT: 
s_setpc_b64 s[30:31] 1487 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1488 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1489 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1490 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1491 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1492 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 4, i32 5, i32 6> 1493 %vecins = or <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> 1494 store <4 x i8> %vecins, ptr addrspace(1) %out0 1495 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1496 ret void 1497} 1498 1499define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1500; GFX10-LABEL: sdiv_store_div: 1501; GFX10: ; %bb.0: 1502; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1503; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1504; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1505; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1506; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1507; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1508; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1509; GFX10-NEXT: global_load_dword v4, v[2:3], off 1510; GFX10-NEXT: global_load_dword v9, v[0:1], off 1511; GFX10-NEXT: s_waitcnt vmcnt(1) 1512; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1513; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 1514; GFX10-NEXT: s_waitcnt vmcnt(0) 1515; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1516; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1517; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1518; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 1519; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 
1520; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1521; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 1522; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 1523; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 1524; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 1525; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1526; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 1527; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1528; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 1529; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 1530; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 1531; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 1532; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 1533; GFX10-NEXT: v_trunc_f32_e32 v15, v15 1534; GFX10-NEXT: v_trunc_f32_e32 v16, v16 1535; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 1536; GFX10-NEXT: v_trunc_f32_e32 v17, v17 1537; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 1538; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2 1539; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 1540; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 1541; GFX10-NEXT: v_trunc_f32_e32 v18, v18 1542; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2 1543; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| 1544; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 1545; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 1546; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 1547; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15 1548; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo 1549; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| 1550; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 1551; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 1552; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 1553; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 1554; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo 1555; 
GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| 1556; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0 1557; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1558; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo 1559; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| 1560; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1561; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2 1562; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo 1563; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1564; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1565; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1566; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 1567; GFX10-NEXT: global_store_dword v[5:6], v0, off 1568; GFX10-NEXT: global_store_dword v[7:8], v1, off 1569; GFX10-NEXT: s_setpc_b64 s[30:31] 1570; 1571; GFX9-LABEL: sdiv_store_div: 1572; GFX9: ; %bb.0: 1573; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1574; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1575; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1576; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1577; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1578; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1579; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1580; GFX9-NEXT: global_load_dword v4, v[2:3], off 1581; GFX9-NEXT: global_load_dword v9, v[0:1], off 1582; GFX9-NEXT: s_mov_b32 s4, 0x60706 1583; GFX9-NEXT: s_waitcnt vmcnt(1) 1584; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1585; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 1586; GFX9-NEXT: s_waitcnt vmcnt(0) 1587; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 1588; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v9), sext(v4) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 1589; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1590; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 1591; GFX9-NEXT: v_cvt_f32_i32_sdwa v11, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1592; GFX9-NEXT: v_xor_b32_sdwa v9, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1593; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1594; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 1595; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1596; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 1597; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 1598; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 1599; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 1600; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 1601; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 1602; GFX9-NEXT: v_trunc_f32_e32 v15, v15 1603; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 1604; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 1605; GFX9-NEXT: v_mul_f32_e32 v18, v2, v18 1606; GFX9-NEXT: v_trunc_f32_e32 v16, v16 1607; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 1608; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 1609; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 1610; GFX9-NEXT: v_trunc_f32_e32 v17, v17 1611; GFX9-NEXT: v_trunc_f32_e32 v18, v18 1612; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11 1613; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v2| 1614; GFX9-NEXT: v_ashrrev_i32_e32 v9, 30, v9 1615; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 1616; GFX9-NEXT: v_cvt_i32_f32_e32 v15, v15 1617; GFX9-NEXT: v_cvt_i32_f32_e32 v16, v16 1618; GFX9-NEXT: v_mad_f32 v3, -v17, v13, v3 1619; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 1620; GFX9-NEXT: v_mad_f32 v2, -v18, v4, v2 1621; GFX9-NEXT: v_cvt_i32_f32_e32 v18, 
v18 1622; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 1623; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v11|, |v12| 1624; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 1625; GFX9-NEXT: v_or_b32_e32 v9, 1, v9 1626; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc 1627; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v13| 1628; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 1629; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc 1630; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| 1631; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc 1632; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 1633; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1634; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 1635; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1636; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1637; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1638; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1639; GFX9-NEXT: global_store_dword v[5:6], v1, off 1640; GFX9-NEXT: global_store_dword v[7:8], v0, off 1641; GFX9-NEXT: s_waitcnt vmcnt(0) 1642; GFX9-NEXT: s_setpc_b64 s[30:31] 1643 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1644 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1645 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1646 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1647 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1648 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 2, i32 3, i32 2, i32 4> 1649 %vecins = sdiv <4 x i8> %shuffle0_0, %vec1 1650 store <4 x i8> %vecins, ptr addrspace(1) %out0 1651 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1652 ret void 1653} 1654 1655 1656define hidden void @sext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, 
ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1657; GFX10-LABEL: sext_store_div: 1658; GFX10: ; %bb.0: 1659; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1660; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1661; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1662; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1663; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1664; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1665; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1666; GFX10-NEXT: global_load_dword v4, v[2:3], off 1667; GFX10-NEXT: global_load_dword v9, v[0:1], off 1668; GFX10-NEXT: s_waitcnt vmcnt(1) 1669; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 1670; GFX10-NEXT: s_waitcnt vmcnt(0) 1671; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9 1672; GFX10-NEXT: v_ashrrev_i16 v2, 8, v4 1673; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 1674; GFX10-NEXT: v_ashrrev_i16 v3, 8, v1 1675; GFX10-NEXT: v_perm_b32 v1, v0, v2, 0x5040100 1676; GFX10-NEXT: v_perm_b32 v0, v3, v3, 0x5040100 1677; GFX10-NEXT: v_perm_b32 v2, v9, v4, 0x3010707 1678; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off 1679; GFX10-NEXT: global_store_dword v[5:6], v2, off 1680; GFX10-NEXT: s_setpc_b64 s[30:31] 1681; 1682; GFX9-LABEL: sext_store_div: 1683; GFX9: ; %bb.0: 1684; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1685; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1686; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1687; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1688; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1689; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1690; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1691; GFX9-NEXT: global_load_dword v4, v[0:1], off 1692; GFX9-NEXT: global_load_dword v9, v[2:3], off 1693; GFX9-NEXT: v_mov_b32_e32 v0, 8 1694; GFX9-NEXT: s_mov_b32 s5, 0x5040100 1695; GFX9-NEXT: s_mov_b32 s4, 0x3010707 1696; GFX9-NEXT: s_waitcnt vmcnt(0) 1697; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9 1698; GFX9-NEXT: v_ashrrev_i16_sdwa v3, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_1 1699; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1700; GFX9-NEXT: v_perm_b32 v1, v3, v1, s5 1701; GFX9-NEXT: v_perm_b32 v0, v0, v0, s5 1702; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 1703; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off 1704; GFX9-NEXT: global_store_dword v[5:6], v2, off 1705; GFX9-NEXT: s_waitcnt vmcnt(0) 1706; GFX9-NEXT: s_setpc_b64 s[30:31] 1707 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1708 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1709 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1710 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1711 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1712 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 3, i32 5, i32 7> 1713 %insvec = sext <4 x i8> %shuffle0_0 to <4 x i16> 1714 store <4 x i16> %insvec, ptr addrspace(1) %out1 1715 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 1716 ret void 1717} 1718 1719 1720define hidden void @shl_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1721; GFX10-LABEL: shl_store_div: 1722; GFX10: ; %bb.0: 1723; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1724; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1725; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1726; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1727; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1728; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1729; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1730; GFX10-NEXT: global_load_dword v4, v[0:1], off 1731; GFX10-NEXT: global_load_dword v9, v[2:3], off 1732; GFX10-NEXT: s_waitcnt vmcnt(1) 1733; GFX10-NEXT: v_lshlrev_b16 v0, 2, v4 1734; GFX10-NEXT: s_waitcnt vmcnt(0) 1735; GFX10-NEXT: v_lshlrev_b16 v1, 1, v9 1736; GFX10-NEXT: v_and_b32_e32 v2, 0xfffffc00, v0 1737; GFX10-NEXT: v_and_b32_e32 v3, 0xfe, v1 
1738; GFX10-NEXT: v_and_b32_e32 v1, 0xfffffe00, v1 1739; GFX10-NEXT: v_and_b32_e32 v0, 0xfc, v0 1740; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 1741; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1742; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5000104 1743; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1744; GFX10-NEXT: global_store_dword v[5:6], v0, off 1745; GFX10-NEXT: global_store_dword v[7:8], v1, off 1746; GFX10-NEXT: s_setpc_b64 s[30:31] 1747; 1748; GFX9-LABEL: shl_store_div: 1749; GFX9: ; %bb.0: 1750; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1751; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1752; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1753; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1754; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1755; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1756; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1757; GFX9-NEXT: global_load_dword v4, v[0:1], off 1758; GFX9-NEXT: global_load_dword v9, v[2:3], off 1759; GFX9-NEXT: s_mov_b32 s4, 0x5000104 1760; GFX9-NEXT: s_waitcnt vmcnt(1) 1761; GFX9-NEXT: v_lshlrev_b16_e32 v1, 2, v4 1762; GFX9-NEXT: s_waitcnt vmcnt(0) 1763; GFX9-NEXT: v_lshlrev_b16_e32 v2, 1, v9 1764; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 1765; GFX9-NEXT: v_and_b32_e32 v3, 0xfffffc00, v1 1766; GFX9-NEXT: v_and_b32_e32 v4, 0xfe, v2 1767; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffe00, v2 1768; GFX9-NEXT: v_and_b32_e32 v1, 0xfc, v1 1769; GFX9-NEXT: v_or_b32_e32 v3, v4, v3 1770; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1771; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1772; GFX9-NEXT: global_store_dword v[5:6], v1, off 1773; GFX9-NEXT: global_store_dword v[7:8], v0, off 1774; GFX9-NEXT: s_waitcnt vmcnt(0) 1775; GFX9-NEXT: s_setpc_b64 s[30:31] 1776 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1777 
%gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1778 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1779 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1780 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 1781 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 0, i32 5> 1782 %vecins = shl <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> 1783 store <4 x i8> %vecins, ptr addrspace(1) %out0 1784 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 1785 ret void 1786} 1787 1788 1789define hidden void @sitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1790; GFX10-LABEL: sitofp_store_div: 1791; GFX10: ; %bb.0: 1792; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1793; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1794; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1795; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1796; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1797; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1798; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1799; GFX10-NEXT: global_load_dword v4, v[2:3], off 1800; GFX10-NEXT: global_load_dword v9, v[0:1], off 1801; GFX10-NEXT: s_waitcnt vmcnt(1) 1802; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 1803; GFX10-NEXT: s_waitcnt vmcnt(0) 1804; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9 1805; GFX10-NEXT: v_ashrrev_i16 v2, 8, v9 1806; GFX10-NEXT: v_ashrrev_i16 v3, 8, v4 1807; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x6010205 1808; GFX10-NEXT: v_bfe_i32 v10, v0, 0, 8 1809; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8 1810; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1811; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1812; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1813; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1814; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 1815; GFX10-NEXT: global_store_dword v[5:6], v4, off 1816; GFX10-NEXT: s_setpc_b64 s[30:31] 1817; 1818; GFX9-LABEL: sitofp_store_div: 1819; GFX9: ; %bb.0: 1820; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1821; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1822; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1823; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1824; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1825; GFX9-NEXT: global_load_dword v9, v[0:1], off 1826; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 1827; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 1828; GFX9-NEXT: global_load_dword v4, v[0:1], off 1829; GFX9-NEXT: s_mov_b32 s4, 0x6010205 1830; GFX9-NEXT: s_waitcnt vmcnt(1) 1831; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 1832; GFX9-NEXT: v_ashrrev_i16_e32 v1, 8, v9 1833; GFX9-NEXT: v_bfe_i32 v10, v0, 0, 8 1834; GFX9-NEXT: s_waitcnt vmcnt(0) 1835; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 1836; GFX9-NEXT: v_ashrrev_i16_e32 v3, 8, v4 1837; GFX9-NEXT: v_bfe_i32 v11, v2, 0, 8 1838; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1839; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1840; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v11) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1841; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v10) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 1842; GFX9-NEXT: v_perm_b32 v4, v4, v9, s4 1843; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 1844; GFX9-NEXT: global_store_dword v[5:6], v4, off 1845; GFX9-NEXT: s_waitcnt vmcnt(0) 1846; GFX9-NEXT: s_setpc_b64 s[30:31] 1847 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1848 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 1849 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 1850 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 1851 %vec1 = 
load <4 x i8>, ptr addrspace(1) %gep1, align 4 1852 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 1, i32 6> 1853 %insvec = sitofp <4 x i8> %shuffle0_0 to <4 x float> 1854 store <4 x float> %insvec, ptr addrspace(1) %out1 1855 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 1856 ret void 1857} 1858 1859 1860define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 1861; GFX10-LABEL: srem_store_div: 1862; GFX10: ; %bb.0: 1863; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1864; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1865; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1866; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 1867; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 1868; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 1869; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 1870; GFX10-NEXT: global_load_dword v4, v[2:3], off 1871; GFX10-NEXT: global_load_dword v9, v[0:1], off 1872; GFX10-NEXT: s_waitcnt vmcnt(1) 1873; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1874; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 1875; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1876; GFX10-NEXT: s_waitcnt vmcnt(0) 1877; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1878; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1879; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 1880; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 1881; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 1882; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 1883; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 1884; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 1885; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1886; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 1887; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 1888; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 1889; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17 1890; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18 1891; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19 1892; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 1893; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 1894; GFX10-NEXT: v_trunc_f32_e32 v17, v17 1895; GFX10-NEXT: v_trunc_f32_e32 v18, v18 1896; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20 1897; GFX10-NEXT: v_trunc_f32_e32 v19, v19 1898; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 1899; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3 1900; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12 1901; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 1902; GFX10-NEXT: v_trunc_f32_e32 v20, v20 1903; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15 1904; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2| 1905; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 1906; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 1907; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21 1908; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 1909; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo 1910; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13| 1911; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 1912; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 1913; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 1914; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 1915; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo 1916; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3| 1917; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 1918; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 1919; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4 1920; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1 1921; GFX10-NEXT: 
v_cndmask_b32_e32 v3, 0, v14, vcc_lo 1922; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15| 1923; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2 1924; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 1925; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3 1926; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo 1927; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10 1928; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 1929; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11 1930; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 1931; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 1932; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12 1933; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3 1934; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1935; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1936; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1937; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1938; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306 1939; GFX10-NEXT: global_store_dword v[5:6], v0, off 1940; GFX10-NEXT: global_store_dword v[7:8], v1, off 1941; GFX10-NEXT: s_setpc_b64 s[30:31] 1942; 1943; GFX9-LABEL: srem_store_div: 1944; GFX9: ; %bb.0: 1945; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1946; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 1947; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 1948; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 1949; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 1950; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 1951; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 1952; GFX9-NEXT: global_load_dword v4, v[2:3], off 1953; GFX9-NEXT: global_load_dword v9, v[0:1], off 1954; GFX9-NEXT: s_mov_b32 s4, 0x2070306 1955; GFX9-NEXT: s_waitcnt vmcnt(1) 1956; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 1957; GFX9-NEXT: s_waitcnt vmcnt(0) 1958; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1959; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1960; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 1961; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 1962; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 1963; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10 1964; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 1965; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 1966; GFX9-NEXT: v_trunc_f32_e32 v18, v18 1967; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 1968; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| 1969; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v3 1970; GFX9-NEXT: v_mul_f32_e32 v14, v16, v19 1971; GFX9-NEXT: v_trunc_f32_e32 v14, v14 1972; GFX9-NEXT: v_mad_f32 v19, -v14, v10, v16 1973; GFX9-NEXT: v_mul_f32_e32 v13, v10, v13 1974; GFX9-NEXT: v_trunc_f32_e32 v13, v13 1975; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v10| 1976; GFX9-NEXT: v_mad_f32 v10, -v13, v3, v10 1977; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 1978; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v10|, |v3| 1979; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v16 1980; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 1981; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 1982; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 1983; GFX9-NEXT: v_mul_f32_e32 v3, v19, v3 1984; GFX9-NEXT: v_trunc_f32_e32 v3, v3 1985; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12 1986; GFX9-NEXT: v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 1987; GFX9-NEXT: 
v_cvt_i32_f32_e32 v13, v13 1988; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 1989; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14 1990; GFX9-NEXT: v_mad_f32 v19, -v3, v16, v19 1991; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 1992; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15 1993; GFX9-NEXT: v_or_b32_e32 v12, 1, v12 1994; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 1995; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 1996; GFX9-NEXT: v_or_b32_e32 v15, 1, v15 1997; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 1998; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 1999; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc 2000; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16| 2001; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] 2002; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5] 2003; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc 2004; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 2005; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4 2006; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4 2007; GFX9-NEXT: v_add_u32_e32 v2, v13, v2 2008; GFX9-NEXT: v_add_u32_e32 v12, v18, v12 2009; GFX9-NEXT: v_add_u32_e32 v13, v14, v15 2010; GFX9-NEXT: v_add_u32_e32 v3, v3, v10 2011; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 2012; GFX9-NEXT: v_mul_lo_u32 v4, v12, v11 2013; GFX9-NEXT: v_mul_lo_u32 v10, v13, v0 2014; GFX9-NEXT: v_mul_lo_u32 v3, v3, v17 2015; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 2016; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2017; GFX9-NEXT: v_sub_u32_e32 v4, v17, v10 2018; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2019; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2020; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2021; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2022; GFX9-NEXT: global_store_dword v[5:6], v0, off 2023; GFX9-NEXT: global_store_dword v[7:8], v1, off 
2024; GFX9-NEXT: s_waitcnt vmcnt(0) 2025; GFX9-NEXT: s_setpc_b64 s[30:31] 2026 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2027 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2028 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2029 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2030 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2031 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 2> 2032 %vecins = srem <4 x i8> %shuffle0_0, %vec1 2033 store <4 x i8> %vecins, ptr addrspace(1) %out0 2034 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 2035 ret void 2036} 2037 2038 2039define hidden void @sub_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2040; GFX10-LABEL: sub_store_div: 2041; GFX10: ; %bb.0: 2042; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2043; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2044; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2045; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2046; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2047; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2048; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2049; GFX10-NEXT: global_load_dword v2, v[2:3], off 2050; GFX10-NEXT: global_load_dword v0, v[0:1], off 2051; GFX10-NEXT: s_waitcnt vmcnt(1) 2052; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 2053; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2 2054; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v2 2055; GFX10-NEXT: s_waitcnt vmcnt(0) 2056; GFX10-NEXT: v_sub_nc_u16 v3, v0, v3 2057; GFX10-NEXT: v_sub_nc_u16 v9, v1, v4 2058; GFX10-NEXT: v_sub_nc_u16 v10, v4, v2 2059; GFX10-NEXT: v_sub_nc_u16 v1, v4, v1 2060; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x6070007 2061; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 2062; GFX10-NEXT: v_lshlrev_b16 v4, 8, v9 2063; GFX10-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2064; GFX10-NEXT: 
v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2065; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2066; GFX10-NEXT: global_store_dword v[5:6], v1, off 2067; GFX10-NEXT: global_store_dword v[7:8], v0, off 2068; GFX10-NEXT: s_setpc_b64 s[30:31] 2069; 2070; GFX9-LABEL: sub_store_div: 2071; GFX9: ; %bb.0: 2072; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2073; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2074; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2075; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2076; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2077; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2078; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2079; GFX9-NEXT: global_load_dword v4, v[0:1], off 2080; GFX9-NEXT: global_load_dword v9, v[2:3], off 2081; GFX9-NEXT: s_mov_b32 s4, 0x6070007 2082; GFX9-NEXT: s_waitcnt vmcnt(0) 2083; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 2084; GFX9-NEXT: v_sub_u16_sdwa v1, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 2085; GFX9-NEXT: v_sub_u16_sdwa v2, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2086; GFX9-NEXT: v_sub_u16_sdwa v3, v9, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_3 2087; GFX9-NEXT: v_sub_u16_sdwa v4, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:WORD_1 2088; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2089; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2090; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2091; GFX9-NEXT: global_store_dword v[5:6], v1, off 2092; GFX9-NEXT: global_store_dword v[7:8], v0, off 2093; GFX9-NEXT: s_waitcnt vmcnt(0) 2094; GFX9-NEXT: s_setpc_b64 s[30:31] 2095 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2096 %gep0 
= getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2097 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2098 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2099 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2100 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 0, i32 7, i32 6> 2101 %vecins = sub <4 x i8> %shuffle0_0, %vec1 2102 store <4 x i8> %vecins, ptr addrspace(1) %out0 2103 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 2104 ret void 2105} 2106 2107 2108define hidden void @sv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2) { 2109; GFX10-LABEL: sv_store_div: 2110; GFX10: ; %bb.0: 2111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2112; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2113; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2114; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2115; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2116; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2117; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2118; GFX10-NEXT: global_load_dword v4, v[0:1], off 2119; GFX10-NEXT: global_load_dword v5, v[2:3], off 2120; GFX10-NEXT: s_waitcnt vmcnt(0) 2121; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x50705 2122; GFX10-NEXT: global_store_dword v[7:8], v0, off 2123; GFX10-NEXT: s_setpc_b64 s[30:31] 2124; 2125; GFX9-LABEL: sv_store_div: 2126; GFX9: ; %bb.0: 2127; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2128; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2129; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2130; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2131; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2132; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2133; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2134; GFX9-NEXT: global_load_dword v4, v[0:1], off 2135; GFX9-NEXT: global_load_dword v5, v[2:3], off 2136; GFX9-NEXT: s_mov_b32 s4, 0x50705 2137; GFX9-NEXT: 
s_waitcnt vmcnt(0) 2138; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 2139; GFX9-NEXT: global_store_dword v[7:8], v0, off 2140; GFX9-NEXT: s_waitcnt vmcnt(0) 2141; GFX9-NEXT: s_setpc_b64 s[30:31] 2142 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2143 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2144 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2145 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2146 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2147 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 1, i32 3, i32 1, i32 4> 2148 %insvec = shufflevector <4 x i8> %shuffle0_0, <4 x i8> %vec1, <4 x i32> <i32 6, i32 3, i32 7, i32 0> 2149 store <4 x i8> %insvec, ptr addrspace(1) %out1 2150 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 2151 ret void 2152} 2153 2154 2155define hidden void @trunc_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2156; GFX10-LABEL: trunc_store_div: 2157; GFX10: ; %bb.0: 2158; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2159; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2160; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2161; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2162; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2163; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2164; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2165; GFX10-NEXT: global_load_dword v4, v[0:1], off 2166; GFX10-NEXT: global_load_dword v9, v[2:3], off 2167; GFX10-NEXT: v_mov_b32_e32 v0, 1 2168; GFX10-NEXT: s_waitcnt vmcnt(1) 2169; GFX10-NEXT: v_and_b32_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2170; GFX10-NEXT: s_waitcnt vmcnt(0) 2171; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 2172; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 2173; GFX10-NEXT: v_lshlrev_b16 v2, 2, v0 2174; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 2175; 
GFX10-NEXT: v_lshlrev_b16 v1, 3, v4 2176; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 2177; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 2178; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x50205 2179; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 2180; GFX10-NEXT: global_store_byte v[7:8], v0, off 2181; GFX10-NEXT: global_store_dword v[5:6], v1, off 2182; GFX10-NEXT: s_setpc_b64 s[30:31] 2183; 2184; GFX9-LABEL: trunc_store_div: 2185; GFX9: ; %bb.0: 2186; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2187; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2188; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2189; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2190; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2191; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2192; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2193; GFX9-NEXT: global_load_dword v4, v[0:1], off 2194; GFX9-NEXT: global_load_dword v9, v[2:3], off 2195; GFX9-NEXT: v_mov_b32_e32 v0, 1 2196; GFX9-NEXT: s_mov_b32 s4, 0x50205 2197; GFX9-NEXT: s_waitcnt vmcnt(1) 2198; GFX9-NEXT: v_lshlrev_b16_e32 v3, 3, v4 2199; GFX9-NEXT: s_waitcnt vmcnt(0) 2200; GFX9-NEXT: v_and_b32_sdwa v2, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 2201; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2202; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 2203; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 2204; GFX9-NEXT: v_lshlrev_b16_e32 v4, 2, v2 2205; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 2206; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 2207; GFX9-NEXT: v_or_b32_e32 v0, v0, v3 2208; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 2209; GFX9-NEXT: global_store_byte v[7:8], v0, off 2210; GFX9-NEXT: global_store_dword v[5:6], v1, off 2211; GFX9-NEXT: s_waitcnt vmcnt(0) 2212; GFX9-NEXT: s_setpc_b64 s[30:31] 2213 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2214 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2215 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2216 %vec0 = load <4 x i8>, ptr addrspace(1) 
%gep0, align 4 2217 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2218 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 2, i32 5, i32 0> 2219 %insvec = trunc <4 x i8> %shuffle0_0 to <4 x i1> 2220 store <4 x i1> %insvec, ptr addrspace(1) %out1 2221 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 2222 ret void 2223} 2224 2225define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2226; GFX10-LABEL: udiv: 2227; GFX10: ; %bb.0: 2228; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2229; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2230; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2231; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2232; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2233; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2234; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2235; GFX10-NEXT: global_load_dword v2, v[2:3], off 2236; GFX10-NEXT: global_load_dword v0, v[0:1], off 2237; GFX10-NEXT: s_waitcnt vmcnt(1) 2238; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 2239; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 2240; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 2241; GFX10-NEXT: s_waitcnt vmcnt(0) 2242; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0 2243; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 2244; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 2245; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 2246; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 2247; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0 2248; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 2249; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207 2250; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10 2251; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11 2252; GFX10-NEXT: v_mul_f32_e32 v13, v1, v13 2253; GFX10-NEXT: v_mul_f32_e32 v12, v15, v12 2254; GFX10-NEXT: v_trunc_f32_e32 v10, v10 2255; GFX10-NEXT: v_trunc_f32_e32 v11, v11 2256; GFX10-NEXT: v_trunc_f32_e32 v13, v13 2257; GFX10-NEXT: v_trunc_f32_e32 v12, v12 2258; GFX10-NEXT: v_mad_f32 
v14, -v10, v1, v14 2259; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 2260; GFX10-NEXT: v_mad_f32 v16, -v11, v3, v4 2261; GFX10-NEXT: v_mad_f32 v17, -v13, v9, v1 2262; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 2263; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, v1 2264; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 2265; GFX10-NEXT: v_mad_f32 v15, -v12, v4, v15 2266; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 2267; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo 2268; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v16|, v3 2269; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo 2270; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v17|, v9 2271; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 2272; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo 2273; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v4 2274; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2275; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9 2276; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo 2277; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2278; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2279; GFX10-NEXT: global_store_dword v[5:6], v1, off 2280; GFX10-NEXT: global_store_dword v[7:8], v0, off 2281; GFX10-NEXT: s_setpc_b64 s[30:31] 2282; 2283; GFX9-LABEL: udiv: 2284; GFX9: ; %bb.0: 2285; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2286; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2287; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2288; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2289; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2290; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2291; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2292; GFX9-NEXT: global_load_dword v4, v[2:3], off 2293; GFX9-NEXT: global_load_dword v9, v[0:1], off 2294; GFX9-NEXT: s_mov_b32 s4, 0x40207 2295; GFX9-NEXT: s_waitcnt vmcnt(1) 2296; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 
v4 2297; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 2298; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 2299; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 2300; GFX9-NEXT: s_waitcnt vmcnt(0) 2301; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 2302; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 2303; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 2304; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 2305; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 2306; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4 2307; GFX9-NEXT: v_trunc_f32_e32 v11, v11 2308; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4 2309; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 2310; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 2311; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 2312; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9 2313; GFX9-NEXT: v_trunc_f32_e32 v12, v12 2314; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13 2315; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10 2316; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 2317; GFX9-NEXT: v_trunc_f32_e32 v13, v13 2318; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 2319; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14 2320; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9 2321; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 2322; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc 2323; GFX9-NEXT: v_trunc_f32_e32 v14, v14 2324; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3 2325; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2 2326; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 2327; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc 2328; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v9|, v10 2329; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc 2330; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4 2331; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc 2332; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 2333; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 2334; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2335; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2336; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD 2337; GFX9-NEXT: global_store_dword v[5:6], v1, off 2338; GFX9-NEXT: global_store_dword v[7:8], v0, off 2339; GFX9-NEXT: s_waitcnt vmcnt(0) 2340; GFX9-NEXT: s_setpc_b64 s[30:31] 2341 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2342 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2343 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2344 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2345 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2346 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 3, i32 6, i32 0, i32 4> 2347 %vecins = udiv <4 x i8> %shuffle0_0, %vec1 2348 store <4 x i8> %vecins, ptr addrspace(1) %out0 2349 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 2350 ret void 2351} 2352 2353 2354define hidden void @uitofp_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2355; GFX10-LABEL: uitofp_store_div: 2356; GFX10: ; %bb.0: 2357; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2358; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2359; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2360; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2361; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2362; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2363; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2364; GFX10-NEXT: global_load_dword v4, v[2:3], off 2365; GFX10-NEXT: global_load_dword v9, v[0:1], off 2366; GFX10-NEXT: s_waitcnt vmcnt(1) 2367; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 2368; GFX10-NEXT: s_waitcnt vmcnt(0) 2369; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v9 2370; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v9 2371; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 2372; GFX10-NEXT: v_perm_b32 v4, v4, v9, 0x5020104 2373; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2374; GFX10-NEXT: global_store_dword v[5:6], v4, off 2375; GFX10-NEXT: s_setpc_b64 s[30:31] 2376; 2377; GFX9-LABEL: uitofp_store_div: 
2378; GFX9: ; %bb.0: 2379; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2380; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2381; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2382; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2383; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2384; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2385; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2386; GFX9-NEXT: global_load_dword v4, v[0:1], off 2387; GFX9-NEXT: global_load_dword v9, v[2:3], off 2388; GFX9-NEXT: s_mov_b32 s4, 0x5020104 2389; GFX9-NEXT: s_waitcnt vmcnt(1) 2390; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 2391; GFX9-NEXT: s_waitcnt vmcnt(0) 2392; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v9 2393; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 2394; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 2395; GFX9-NEXT: v_perm_b32 v10, v9, v4, s4 2396; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off 2397; GFX9-NEXT: global_store_dword v[5:6], v10, off 2398; GFX9-NEXT: s_waitcnt vmcnt(0) 2399; GFX9-NEXT: s_setpc_b64 s[30:31] 2400 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2401 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2402 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2403 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2404 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2405 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 4, i32 1, i32 2, i32 5> 2406 %insvec = uitofp <4 x i8> %shuffle0_0 to <4 x float> 2407 store <4 x float> %insvec, ptr addrspace(1) %out1 2408 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 2409 ret void 2410} 2411 2412 2413define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2414; GFX10-LABEL: urem_store_div: 2415; GFX10: ; %bb.0: 2416; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2417; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2418; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2419; GFX10-NEXT: v_add_co_u32 
v2, vcc_lo, v2, v4 2420; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2421; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2422; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2423; GFX10-NEXT: global_load_dword v2, v[2:3], off 2424; GFX10-NEXT: global_load_dword v0, v[0:1], off 2425; GFX10-NEXT: s_waitcnt vmcnt(1) 2426; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 2427; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 2428; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 2429; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 2430; GFX10-NEXT: s_waitcnt vmcnt(0) 2431; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0 2432; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 2433; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 2434; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 2435; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 2436; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 2437; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2 2438; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2 2439; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10 2440; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11 2441; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12 2442; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13 2443; GFX10-NEXT: v_trunc_f32_e32 v10, v10 2444; GFX10-NEXT: v_trunc_f32_e32 v11, v11 2445; GFX10-NEXT: v_trunc_f32_e32 v12, v12 2446; GFX10-NEXT: v_trunc_f32_e32 v13, v13 2447; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3 2448; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 2449; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3 2450; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 2451; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3 2452; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1 2453; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 2454; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15 2455; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 2456; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo 2457; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3 2458; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 2459; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo 2460; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, v4 2461; GFX10-NEXT: 
v_mul_lo_u32 v3, v3, v16 2462; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1 2463; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo 2464; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9 2465; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14 2466; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2467; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo 2468; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2469; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17 2470; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4 2471; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2472; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505 2473; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2474; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2475; GFX10-NEXT: global_store_dword v[5:6], v1, off 2476; GFX10-NEXT: global_store_dword v[7:8], v0, off 2477; GFX10-NEXT: s_setpc_b64 s[30:31] 2478; 2479; GFX9-LABEL: urem_store_div: 2480; GFX9: ; %bb.0: 2481; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2482; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2483; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2484; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2485; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2486; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2487; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2488; GFX9-NEXT: global_load_dword v4, v[2:3], off 2489; GFX9-NEXT: global_load_dword v9, v[0:1], off 2490; GFX9-NEXT: s_mov_b32 s4, 0x2050505 2491; GFX9-NEXT: s_waitcnt vmcnt(1) 2492; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 2493; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 2494; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 2495; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3 2496; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4 2497; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v17, v11 2498; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 2499; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4 2500; GFX9-NEXT: v_trunc_f32_e32 v15, v15 2501; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 2502; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 2503; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 2504; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 2505; GFX9-NEXT: v_trunc_f32_e32 v16, v16 2506; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 2507; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 2508; GFX9-NEXT: v_mad_f32 v2, -v16, v3, v3 2509; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16 2510; GFX9-NEXT: s_waitcnt vmcnt(0) 2511; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9 2512; GFX9-NEXT: v_trunc_f32_e32 v17, v17 2513; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 2514; GFX9-NEXT: v_mad_f32 v19, -v17, v11, v3 2515; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17 2516; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc 2517; GFX9-NEXT: v_trunc_f32_e32 v18, v18 2518; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2519; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 2520; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18 2521; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc 2522; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v11 2523; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc 2524; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14 2525; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 2526; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 2527; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4 2528; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc 2529; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 2530; GFX9-NEXT: v_mul_lo_u32 v4, v15, v4 2531; GFX9-NEXT: v_mul_lo_u32 v2, v2, v10 2532; GFX9-NEXT: v_mul_lo_u32 v0, v3, v0 2533; GFX9-NEXT: v_mul_lo_u32 v3, v11, v12 2534; GFX9-NEXT: v_sub_u32_e32 v4, v10, v4 2535; GFX9-NEXT: v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2536; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0 2537; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2538; 
GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2539; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2540; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2541; GFX9-NEXT: global_store_dword v[5:6], v0, off 2542; GFX9-NEXT: global_store_dword v[7:8], v1, off 2543; GFX9-NEXT: s_waitcnt vmcnt(0) 2544; GFX9-NEXT: s_setpc_b64 s[30:31] 2545 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2546 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2547 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2548 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2549 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2550 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 5, i32 5, i32 5, i32 2> 2551 %vecins = urem <4 x i8> %shuffle0_0, %vec1 2552 store <4 x i8> %vecins, ptr addrspace(1) %out0 2553 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 2554 ret void 2555} 2556 2557 2558define hidden void @xor_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2559; GFX10-LABEL: xor_store_div: 2560; GFX10: ; %bb.0: 2561; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2562; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2563; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2564; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2565; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2566; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2567; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2568; GFX10-NEXT: global_load_dword v4, v[0:1], off 2569; GFX10-NEXT: global_load_dword v9, v[2:3], off 2570; GFX10-NEXT: v_mov_b32_e32 v0, 0xffffff00 2571; GFX10-NEXT: v_mov_b32_e32 v1, 1 2572; GFX10-NEXT: v_mov_b32_e32 v2, 2 2573; GFX10-NEXT: s_waitcnt vmcnt(1) 2574; GFX10-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2575; GFX10-NEXT: s_waitcnt vmcnt(0) 2576; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v9 2577; GFX10-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2578; GFX10-NEXT: v_xor_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2579; GFX10-NEXT: v_xor_b32_e32 v0, 0x200, v0 2580; GFX10-NEXT: v_xor_b32_e32 v3, 0x100, v3 2581; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 2582; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2583; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2584; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5060307 2585; GFX10-NEXT: global_store_dword v[5:6], v0, off 2586; GFX10-NEXT: global_store_dword v[7:8], v1, off 2587; GFX10-NEXT: s_setpc_b64 s[30:31] 2588; 2589; GFX9-LABEL: xor_store_div: 2590; GFX9: ; %bb.0: 2591; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2592; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2593; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2594; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2595; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2596; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2597; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2598; GFX9-NEXT: global_load_dword v4, v[0:1], off 2599; GFX9-NEXT: global_load_dword v9, v[2:3], off 2600; GFX9-NEXT: s_movk_i32 s4, 0xff00 2601; GFX9-NEXT: v_mov_b32_e32 v0, 1 2602; GFX9-NEXT: v_mov_b32_e32 v1, 2 2603; GFX9-NEXT: s_mov_b32 s5, 0x5060307 2604; GFX9-NEXT: s_waitcnt vmcnt(1) 2605; GFX9-NEXT: v_and_b32_sdwa v2, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2606; GFX9-NEXT: s_waitcnt vmcnt(0) 2607; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v9 2608; GFX9-NEXT: v_xor_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD 2609; GFX9-NEXT: v_xor_b32_sdwa v1, v9, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2610; GFX9-NEXT: v_xor_b32_e32 v2, 0x200, v2 2611; GFX9-NEXT: v_xor_b32_e32 v3, 0x100, v3 2612; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 2613; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 2614; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2615; GFX9-NEXT: v_perm_b32 v4, v9, v4, s5 2616; GFX9-NEXT: global_store_dword v[5:6], v0, off 2617; GFX9-NEXT: global_store_dword v[7:8], v4, off 2618; GFX9-NEXT: s_waitcnt vmcnt(0) 2619; GFX9-NEXT: s_setpc_b64 s[30:31] 2620 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2621 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2622 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2623 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2624 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2625 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 7, i32 3, i32 6, i32 5> 2626 %vecins = xor <4 x i8> %shuffle0_0, <i8 1, i8 2, i8 2, i8 1> 2627 store <4 x i8> %vecins, ptr addrspace(1) %out0 2628 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1 2629 ret void 2630} 2631 2632 2633define hidden void @zext_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { 2634; GFX10-LABEL: zext_store_div: 2635; GFX10: ; %bb.0: 2636; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2637; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2638; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2639; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 2640; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 2641; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 2642; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 2643; GFX10-NEXT: global_load_dword v4, v[0:1], off 2644; GFX10-NEXT: global_load_dword v9, v[2:3], off 2645; GFX10-NEXT: v_mov_b32_e32 v0, 0xff 2646; GFX10-NEXT: s_waitcnt 
vmcnt(1) 2647; GFX10-NEXT: v_lshrrev_b16 v1, 8, v4 2648; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v4 2649; GFX10-NEXT: s_waitcnt vmcnt(0) 2650; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v9 2651; GFX10-NEXT: v_and_b32_sdwa v10, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2652; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x5040100 2653; GFX10-NEXT: v_perm_b32 v2, v4, v9, 0x60504 2654; GFX10-NEXT: v_perm_b32 v1, v3, v10, 0x5040100 2655; GFX10-NEXT: global_store_dwordx2 v[7:8], v[0:1], off 2656; GFX10-NEXT: global_store_dword v[5:6], v2, off 2657; GFX10-NEXT: s_setpc_b64 s[30:31] 2658; 2659; GFX9-LABEL: zext_store_div: 2660; GFX9: ; %bb.0: 2661; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2662; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 2663; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 2664; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 2665; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2666; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 2667; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2668; GFX9-NEXT: global_load_dword v4, v[0:1], off 2669; GFX9-NEXT: global_load_dword v9, v[2:3], off 2670; GFX9-NEXT: s_mov_b32 s4, 0x60504 2671; GFX9-NEXT: s_movk_i32 s5, 0xff 2672; GFX9-NEXT: s_mov_b32 s6, 0x5040100 2673; GFX9-NEXT: s_waitcnt vmcnt(1) 2674; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 2675; GFX9-NEXT: s_waitcnt vmcnt(0) 2676; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 2677; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v4 2678; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v9 2679; GFX9-NEXT: v_and_b32_sdwa v4, v4, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2680; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 2681; GFX9-NEXT: v_perm_b32 v1, v3, v4, s6 2682; GFX9-NEXT: global_store_dwordx2 v[7:8], v[0:1], off 2683; GFX9-NEXT: global_store_dword v[5:6], v2, off 2684; GFX9-NEXT: s_waitcnt vmcnt(0) 2685; GFX9-NEXT: s_setpc_b64 s[30:31] 2686 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2687 %gep0 = getelementptr <4 x i8>, ptr addrspace(1) %in0, i32 %tid 2688 %gep1 = 
getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid 2689 %vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4 2690 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4 2691 %shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 2692 %insvec = zext <4 x i8> %shuffle0_0 to <4 x i16> 2693 store <4 x i16> %insvec, ptr addrspace(1) %out1 2694 store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0 2695 ret void 2696} 2697 2698define void @Source16Bit(i16 %in, <2 x i16> %reg) { 2699; GFX10-LABEL: Source16Bit: 2700; GFX10: ; %bb.0: ; %entry 2701; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2702; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204 2703; GFX10-NEXT: global_store_dword v[0:1], v0, off 2704; GFX10-NEXT: s_setpc_b64 s[30:31] 2705; 2706; GFX9-LABEL: Source16Bit: 2707; GFX9: ; %bb.0: ; %entry 2708; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2709; GFX9-NEXT: s_mov_b32 s4, 0x3050204 2710; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 2711; GFX9-NEXT: global_store_dword v[0:1], v0, off 2712; GFX9-NEXT: s_waitcnt vmcnt(0) 2713; GFX9-NEXT: s_setpc_b64 s[30:31] 2714entry: 2715 %elt0 = extractelement <2 x i16> %reg, i32 1 2716 %e0b0 = and i16 %elt0, 255 2717 %e0b1 = and i16 %elt0, -256 2718 %e1b0 = and i16 %in, 255 2719 %e1b1 = and i16 %in, -256 2720 %tmp0 = shl i16 %e0b0, 8 2721 %byte0 = or i16 %tmp0, %e1b0 2722 %tmp2 = lshr i16 %e1b1, 8 2723 %byte1 = or i16 %e0b1, %tmp2 2724 %ext0 = zext i16 %byte0 to i32 2725 %ext1 = zext i16 %byte1 to i32 2726 %shifted = shl i32 %ext1, 16 2727 %result = or i32 %shifted, %ext0 2728 store i32 %result, ptr addrspace(1) undef 2729 ret void 2730} 2731 2732define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2733; GFX10-LABEL: extract3744: 2734; GFX10: ; %bb.0: 2735; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2736; GFX10-NEXT: global_load_dword v6, v[0:1], off 2737; GFX10-NEXT: global_load_dword v7, v[2:3], off 2738; GFX10-NEXT: 
s_waitcnt vmcnt(0) 2739; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404 2740; GFX10-NEXT: global_store_dword v[4:5], v0, off 2741; GFX10-NEXT: s_setpc_b64 s[30:31] 2742; 2743; GFX9-LABEL: extract3744: 2744; GFX9: ; %bb.0: 2745; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2746; GFX9-NEXT: global_load_dword v6, v[0:1], off 2747; GFX9-NEXT: global_load_dword v7, v[2:3], off 2748; GFX9-NEXT: s_mov_b32 s4, 0x3070404 2749; GFX9-NEXT: s_waitcnt vmcnt(0) 2750; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 2751; GFX9-NEXT: global_store_dword v[4:5], v0, off 2752; GFX9-NEXT: s_waitcnt vmcnt(0) 2753; GFX9-NEXT: s_setpc_b64 s[30:31] 2754 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 2755 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 2756 %v1e0 = extractelement <4 x i8> %vec1, i64 0 2757 %zv1e0 = zext i8 %v1e0 to i32 2758 %byte1 = shl i32 %zv1e0, 8 2759 2760 %v1e3 = extractelement <4 x i8> %vec1, i64 3 2761 %zv1e3 = zext i8 %v1e3 to i32 2762 %byte2 = shl i32 %zv1e3, 16 2763 %v2e3 = extractelement <4 x i8> %vec2, i64 3 2764 %zv2e3 = zext i8 %v2e3 to i32 2765 %byte3 = shl i32 %zv2e3, 24 2766 2767 %tmp0 = or i32 %zv1e0, %byte1 2768 %tmp1 = or i32 %tmp0, %byte2 2769 %res = or i32 %tmp1, %byte3 2770 store i32 %res, ptr addrspace(1) %out0, align 4 2771 ret void 2772} 2773 2774declare i32 @llvm.amdgcn.perm(i32, i32, i32) 2775 2776define hidden void @extract_perm_3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2777; GFX10-LABEL: extract_perm_3744: 2778; GFX10: ; %bb.0: 2779; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2780; GFX10-NEXT: global_load_dword v6, v[0:1], off 2781; GFX10-NEXT: global_load_dword v7, v[2:3], off 2782; GFX10-NEXT: s_waitcnt vmcnt(0) 2783; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404 2784; GFX10-NEXT: global_store_dword v[4:5], v0, off 2785; GFX10-NEXT: s_setpc_b64 s[30:31] 2786; 2787; GFX9-LABEL: extract_perm_3744: 2788; GFX9: ; %bb.0: 2789; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2790; GFX9-NEXT: 
global_load_dword v6, v[0:1], off 2791; GFX9-NEXT: global_load_dword v7, v[2:3], off 2792; GFX9-NEXT: s_mov_b32 s4, 0x3070404 2793; GFX9-NEXT: s_waitcnt vmcnt(0) 2794; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 2795; GFX9-NEXT: global_store_dword v[4:5], v0, off 2796; GFX9-NEXT: s_waitcnt vmcnt(0) 2797; GFX9-NEXT: s_setpc_b64 s[30:31] 2798 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 2799 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 2800 %cast1 = bitcast <4 x i8> %vec1 to i32 2801 %cast2 = bitcast <4 x i8> %vec2 to i32 2802 %lo24 = call i32 @llvm.amdgcn.perm(i32 %cast1, i32 %cast1, i32 201523200) 2803 %hi8 = call i32 @llvm.amdgcn.perm(i32 %cast2, i32 %cast2, i32 51121164) 2804 %res = or i32 %hi8, %lo24 2805 store i32 %res, ptr addrspace(1) %out0, align 4 2806 ret void 2807} 2808 2809define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2810; GFX10-LABEL: extract1347_v2i16: 2811; GFX10: ; %bb.0: 2812; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2813; GFX10-NEXT: global_load_dword v6, v[0:1], off 2814; GFX10-NEXT: global_load_dword v7, v[2:3], off 2815; GFX10-NEXT: s_waitcnt vmcnt(0) 2816; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407 2817; GFX10-NEXT: global_store_dword v[4:5], v0, off 2818; GFX10-NEXT: s_setpc_b64 s[30:31] 2819; 2820; GFX9-LABEL: extract1347_v2i16: 2821; GFX9: ; %bb.0: 2822; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2823; GFX9-NEXT: global_load_dword v6, v[0:1], off 2824; GFX9-NEXT: global_load_dword v7, v[2:3], off 2825; GFX9-NEXT: s_mov_b32 s4, 0x1030407 2826; GFX9-NEXT: s_waitcnt vmcnt(0) 2827; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 2828; GFX9-NEXT: global_store_dword v[4:5], v0, off 2829; GFX9-NEXT: s_waitcnt vmcnt(0) 2830; GFX9-NEXT: s_setpc_b64 s[30:31] 2831 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 2832 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 2833 %v1e0 = extractelement <2 x i16> %vec1, i64 0 2834 %v1e1 = extractelement <2 x i16> 
%vec1, i64 1 2835 %v2e0 = extractelement <2 x i16> %vec2, i64 0 2836 %v2e1 = extractelement <2 x i16> %vec2, i64 1 2837 2838 %b0t0 = and i16 -256, %v2e1 2839 %b0t1 = lshr i16 %b0t0, 8 2840 %byte0 = zext i16 %b0t1 to i32 2841 2842 %b1t0 = and i16 255, %v2e0 2843 %b1t1 = zext i16 %b1t0 to i32 2844 %byte1 = shl i32 %b1t1, 8 2845 2846 %b2t0 = and i16 -256, %v1e1 2847 %b2t1 = lshr i16 %b2t0, 8 2848 %b2t2 = zext i16 %b2t1 to i32 2849 %byte2 = shl i32 %b2t2, 16 2850 2851 %b3t0 = and i16 -256, %v1e0 2852 %b3t1 = lshr i16 %b3t0, 8 2853 %b3t2 = zext i16 %b3t1 to i32 2854 %byte3 = shl i32 %b3t2, 24 2855 2856 %tmp0 = or i32 %byte0, %byte1 2857 %tmp1 = or i32 %tmp0, %byte2 2858 %res = or i32 %tmp1, %byte3 2859 store i32 %res, ptr addrspace(1) %out0, align 4 2860 ret void 2861} 2862 2863 2864declare i16 @llvm.fshr.i16(i16, i16, i16) 2865 2866define hidden void @fshri16_8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2867; GFX10-LABEL: fshri16_8: 2868; GFX10: ; %bb.0: 2869; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2870; GFX10-NEXT: global_load_dword v6, v[0:1], off 2871; GFX10-NEXT: global_load_dword v7, v[2:3], off 2872; GFX10-NEXT: s_waitcnt vmcnt(0) 2873; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407 2874; GFX10-NEXT: global_store_dword v[4:5], v0, off 2875; GFX10-NEXT: s_setpc_b64 s[30:31] 2876; 2877; GFX9-LABEL: fshri16_8: 2878; GFX9: ; %bb.0: 2879; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2880; GFX9-NEXT: global_load_dword v6, v[0:1], off 2881; GFX9-NEXT: global_load_dword v7, v[2:3], off 2882; GFX9-NEXT: s_mov_b32 s4, 0x30407 2883; GFX9-NEXT: s_waitcnt vmcnt(0) 2884; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 2885; GFX9-NEXT: global_store_dword v[4:5], v0, off 2886; GFX9-NEXT: s_waitcnt vmcnt(0) 2887; GFX9-NEXT: s_setpc_b64 s[30:31] 2888 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 2889 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 2890 %v1e0 = extractelement <2 x i16> %vec1, i64 0 2891 %v1e1 = extractelement <2 x 
i16> %vec1, i64 1 2892 %v2e0 = extractelement <2 x i16> %vec2, i64 0 2893 %v2e1 = extractelement <2 x i16> %vec2, i64 1 2894 2895 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 8) 2896 %byte01 = zext i16 %tmp01.0 to i32 2897 2898 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 8) 2899 %tmp23.1 = zext i16 %tmp23.0 to i32 2900 %byte23 = shl i32 %tmp23.1, 16 2901 %res = or i32 %byte01, %byte23 2902 store i32 %res, ptr addrspace(1) %out0, align 4 2903 ret void 2904} 2905 2906define hidden void @fshri16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2907; GFX10-LABEL: fshri16_16: 2908; GFX10: ; %bb.0: 2909; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2910; GFX10-NEXT: global_load_dword v6, v[0:1], off 2911; GFX10-NEXT: global_load_dword v7, v[2:3], off 2912; GFX10-NEXT: s_waitcnt vmcnt(0) 2913; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706 2914; GFX10-NEXT: global_store_dword v[4:5], v0, off 2915; GFX10-NEXT: s_setpc_b64 s[30:31] 2916; 2917; GFX9-LABEL: fshri16_16: 2918; GFX9: ; %bb.0: 2919; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2920; GFX9-NEXT: global_load_dword v6, v[0:1], off 2921; GFX9-NEXT: global_load_dword v7, v[2:3], off 2922; GFX9-NEXT: s_mov_b32 s4, 0x3020706 2923; GFX9-NEXT: s_waitcnt vmcnt(0) 2924; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 2925; GFX9-NEXT: global_store_dword v[4:5], v0, off 2926; GFX9-NEXT: s_waitcnt vmcnt(0) 2927; GFX9-NEXT: s_setpc_b64 s[30:31] 2928 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 2929 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 2930 %v1e0 = extractelement <2 x i16> %vec1, i64 0 2931 %v1e1 = extractelement <2 x i16> %vec1, i64 1 2932 %v2e0 = extractelement <2 x i16> %vec2, i64 0 2933 %v2e1 = extractelement <2 x i16> %vec2, i64 1 2934 2935 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 16) 2936 %byte01 = zext i16 %tmp01.0 to i32 2937 2938 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 16) 2939 %tmp23.1 = zext 
i16 %tmp23.0 to i32 2940 %byte23 = shl i32 %tmp23.1, 16 2941 %res = or i32 %byte01, %byte23 2942 store i32 %res, ptr addrspace(1) %out0, align 4 2943 ret void 2944} 2945 2946define hidden void @fshri16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2947; GFX10-LABEL: fshri16_24: 2948; GFX10: ; %bb.0: 2949; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2950; GFX10-NEXT: global_load_dword v6, v[0:1], off 2951; GFX10-NEXT: global_load_dword v7, v[2:3], off 2952; GFX10-NEXT: s_waitcnt vmcnt(0) 2953; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407 2954; GFX10-NEXT: global_store_dword v[4:5], v0, off 2955; GFX10-NEXT: s_setpc_b64 s[30:31] 2956; 2957; GFX9-LABEL: fshri16_24: 2958; GFX9: ; %bb.0: 2959; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2960; GFX9-NEXT: global_load_dword v6, v[0:1], off 2961; GFX9-NEXT: global_load_dword v7, v[2:3], off 2962; GFX9-NEXT: s_mov_b32 s4, 0x30407 2963; GFX9-NEXT: s_waitcnt vmcnt(0) 2964; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 2965; GFX9-NEXT: global_store_dword v[4:5], v0, off 2966; GFX9-NEXT: s_waitcnt vmcnt(0) 2967; GFX9-NEXT: s_setpc_b64 s[30:31] 2968 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 2969 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 2970 %v1e0 = extractelement <2 x i16> %vec1, i64 0 2971 %v1e1 = extractelement <2 x i16> %vec1, i64 1 2972 %v2e0 = extractelement <2 x i16> %vec2, i64 0 2973 %v2e1 = extractelement <2 x i16> %vec2, i64 1 2974 2975 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 24) 2976 %byte01 = zext i16 %tmp01.0 to i32 2977 2978 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 24) 2979 %tmp23.1 = zext i16 %tmp23.0 to i32 2980 %byte23 = shl i32 %tmp23.1, 16 2981 %res = or i32 %byte01, %byte23 2982 store i32 %res, ptr addrspace(1) %out0, align 4 2983 ret void 2984} 2985 2986define hidden void @fshri16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 2987; GFX10-LABEL: fshri16_32: 2988; GFX10: ; %bb.0: 
2989; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2990; GFX10-NEXT: global_load_dword v6, v[0:1], off 2991; GFX10-NEXT: global_load_dword v7, v[2:3], off 2992; GFX10-NEXT: s_waitcnt vmcnt(0) 2993; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3020706 2994; GFX10-NEXT: global_store_dword v[4:5], v0, off 2995; GFX10-NEXT: s_setpc_b64 s[30:31] 2996; 2997; GFX9-LABEL: fshri16_32: 2998; GFX9: ; %bb.0: 2999; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3000; GFX9-NEXT: global_load_dword v6, v[0:1], off 3001; GFX9-NEXT: global_load_dword v7, v[2:3], off 3002; GFX9-NEXT: s_mov_b32 s4, 0x3020706 3003; GFX9-NEXT: s_waitcnt vmcnt(0) 3004; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3005; GFX9-NEXT: global_store_dword v[4:5], v0, off 3006; GFX9-NEXT: s_waitcnt vmcnt(0) 3007; GFX9-NEXT: s_setpc_b64 s[30:31] 3008 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3009 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3010 %v1e0 = extractelement <2 x i16> %vec1, i64 0 3011 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3012 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3013 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3014 3015 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 32) 3016 %byte01 = zext i16 %tmp01.0 to i32 3017 3018 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 32) 3019 %tmp23.1 = zext i16 %tmp23.0 to i32 3020 %byte23 = shl i32 %tmp23.1, 16 3021 %res = or i32 %byte01, %byte23 3022 store i32 %res, ptr addrspace(1) %out0, align 4 3023 ret void 3024} 3025 3026define hidden void @fshri16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 3027; GFX10-LABEL: fshri16_88: 3028; GFX10: ; %bb.0: 3029; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3030; GFX10-NEXT: global_load_dword v6, v[0:1], off 3031; GFX10-NEXT: global_load_dword v7, v[2:3], off 3032; GFX10-NEXT: s_waitcnt vmcnt(0) 3033; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407 3034; GFX10-NEXT: global_store_dword v[4:5], v0, off 3035; GFX10-NEXT: s_setpc_b64 
s[30:31] 3036; 3037; GFX9-LABEL: fshri16_88: 3038; GFX9: ; %bb.0: 3039; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3040; GFX9-NEXT: global_load_dword v6, v[0:1], off 3041; GFX9-NEXT: global_load_dword v7, v[2:3], off 3042; GFX9-NEXT: s_mov_b32 s4, 0x30407 3043; GFX9-NEXT: s_waitcnt vmcnt(0) 3044; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3045; GFX9-NEXT: global_store_dword v[4:5], v0, off 3046; GFX9-NEXT: s_waitcnt vmcnt(0) 3047; GFX9-NEXT: s_setpc_b64 s[30:31] 3048 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3049 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3050 %v1e0 = extractelement <2 x i16> %vec1, i64 0 3051 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3052 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3053 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3054 3055 %tmp01.0 = call i16 @llvm.fshr.i16(i16 %v2e0, i16 %v2e1, i16 88) 3056 %byte01 = zext i16 %tmp01.0 to i32 3057 3058 %tmp23.0 = call i16 @llvm.fshr.i16(i16 %v1e0, i16 %v1e1, i16 88) 3059 %tmp23.1 = zext i16 %tmp23.0 to i32 3060 %byte23 = shl i32 %tmp23.1, 16 3061 %res = or i32 %byte01, %byte23 3062 store i32 %res, ptr addrspace(1) %out0, align 4 3063 ret void 3064} 3065 3066declare i16 @llvm.fshl.i16(i16, i16, i16) 3067 3068define hidden void @fshli16_1347(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 3069; GFX10-LABEL: fshli16_1347: 3070; GFX10: ; %bb.0: 3071; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3072; GFX10-NEXT: global_load_dword v6, v[0:1], off 3073; GFX10-NEXT: global_load_dword v7, v[2:3], off 3074; GFX10-NEXT: s_waitcnt vmcnt(0) 3075; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407 3076; GFX10-NEXT: global_store_dword v[4:5], v0, off 3077; GFX10-NEXT: s_setpc_b64 s[30:31] 3078; 3079; GFX9-LABEL: fshli16_1347: 3080; GFX9: ; %bb.0: 3081; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3082; GFX9-NEXT: global_load_dword v6, v[0:1], off 3083; GFX9-NEXT: global_load_dword v7, v[2:3], off 3084; GFX9-NEXT: s_mov_b32 s4, 0x30407 3085; 
GFX9-NEXT: s_waitcnt vmcnt(0) 3086; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3087; GFX9-NEXT: global_store_dword v[4:5], v0, off 3088; GFX9-NEXT: s_waitcnt vmcnt(0) 3089; GFX9-NEXT: s_setpc_b64 s[30:31] 3090 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3091 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3092 %v1e0 = extractelement <2 x i16> %vec1, i64 0 3093 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3094 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3095 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3096 3097 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 8) 3098 %byte01 = zext i16 %tmp01.0 to i32 3099 3100 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 8) 3101 %tmp23.1 = zext i16 %tmp23.0 to i32 3102 %byte23 = shl i32 %tmp23.1, 16 3103 %res = or i32 %byte01, %byte23 3104 store i32 %res, ptr addrspace(1) %out0, align 4 3105 ret void 3106} 3107 3108define hidden void @fshli16_16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 3109; GFX10-LABEL: fshli16_16: 3110; GFX10: ; %bb.0: 3111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3112; GFX10-NEXT: global_load_dword v6, v[0:1], off 3113; GFX10-NEXT: global_load_dword v7, v[2:3], off 3114; GFX10-NEXT: s_waitcnt vmcnt(0) 3115; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504 3116; GFX10-NEXT: global_store_dword v[4:5], v0, off 3117; GFX10-NEXT: s_setpc_b64 s[30:31] 3118; 3119; GFX9-LABEL: fshli16_16: 3120; GFX9: ; %bb.0: 3121; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3122; GFX9-NEXT: global_load_dword v6, v[0:1], off 3123; GFX9-NEXT: global_load_dword v7, v[2:3], off 3124; GFX9-NEXT: s_mov_b32 s4, 0x1000504 3125; GFX9-NEXT: s_waitcnt vmcnt(0) 3126; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3127; GFX9-NEXT: global_store_dword v[4:5], v0, off 3128; GFX9-NEXT: s_waitcnt vmcnt(0) 3129; GFX9-NEXT: s_setpc_b64 s[30:31] 3130 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3131 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3132 %v1e0 = 
extractelement <2 x i16> %vec1, i64 0 3133 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3134 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3135 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3136 3137 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 16) 3138 %byte01 = zext i16 %tmp01.0 to i32 3139 3140 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 16) 3141 %tmp23.1 = zext i16 %tmp23.0 to i32 3142 %byte23 = shl i32 %tmp23.1, 16 3143 %res = or i32 %byte01, %byte23 3144 store i32 %res, ptr addrspace(1) %out0, align 4 3145 ret void 3146} 3147 3148define hidden void @fshli16_24(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 3149; GFX10-LABEL: fshli16_24: 3150; GFX10: ; %bb.0: 3151; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3152; GFX10-NEXT: global_load_dword v6, v[0:1], off 3153; GFX10-NEXT: global_load_dword v7, v[2:3], off 3154; GFX10-NEXT: s_waitcnt vmcnt(0) 3155; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407 3156; GFX10-NEXT: global_store_dword v[4:5], v0, off 3157; GFX10-NEXT: s_setpc_b64 s[30:31] 3158; 3159; GFX9-LABEL: fshli16_24: 3160; GFX9: ; %bb.0: 3161; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3162; GFX9-NEXT: global_load_dword v6, v[0:1], off 3163; GFX9-NEXT: global_load_dword v7, v[2:3], off 3164; GFX9-NEXT: s_mov_b32 s4, 0x30407 3165; GFX9-NEXT: s_waitcnt vmcnt(0) 3166; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3167; GFX9-NEXT: global_store_dword v[4:5], v0, off 3168; GFX9-NEXT: s_waitcnt vmcnt(0) 3169; GFX9-NEXT: s_setpc_b64 s[30:31] 3170 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3171 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3172 %v1e0 = extractelement <2 x i16> %vec1, i64 0 3173 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3174 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3175 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3176 3177 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 24) 3178 %byte01 = zext i16 %tmp01.0 to i32 3179 3180 %tmp23.0 = call 
i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 24) 3181 %tmp23.1 = zext i16 %tmp23.0 to i32 3182 %byte23 = shl i32 %tmp23.1, 16 3183 %res = or i32 %byte01, %byte23 3184 store i32 %res, ptr addrspace(1) %out0, align 4 3185 ret void 3186} 3187 3188define hidden void @fshli16_32(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 3189; GFX10-LABEL: fshli16_32: 3190; GFX10: ; %bb.0: 3191; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3192; GFX10-NEXT: global_load_dword v6, v[0:1], off 3193; GFX10-NEXT: global_load_dword v7, v[2:3], off 3194; GFX10-NEXT: s_waitcnt vmcnt(0) 3195; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1000504 3196; GFX10-NEXT: global_store_dword v[4:5], v0, off 3197; GFX10-NEXT: s_setpc_b64 s[30:31] 3198; 3199; GFX9-LABEL: fshli16_32: 3200; GFX9: ; %bb.0: 3201; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3202; GFX9-NEXT: global_load_dword v6, v[0:1], off 3203; GFX9-NEXT: global_load_dword v7, v[2:3], off 3204; GFX9-NEXT: s_mov_b32 s4, 0x1000504 3205; GFX9-NEXT: s_waitcnt vmcnt(0) 3206; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3207; GFX9-NEXT: global_store_dword v[4:5], v0, off 3208; GFX9-NEXT: s_waitcnt vmcnt(0) 3209; GFX9-NEXT: s_setpc_b64 s[30:31] 3210 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3211 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3212 %v1e0 = extractelement <2 x i16> %vec1, i64 0 3213 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3214 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3215 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3216 3217 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 32) 3218 %byte01 = zext i16 %tmp01.0 to i32 3219 3220 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 32) 3221 %tmp23.1 = zext i16 %tmp23.0 to i32 3222 %byte23 = shl i32 %tmp23.1, 16 3223 %res = or i32 %byte01, %byte23 3224 store i32 %res, ptr addrspace(1) %out0, align 4 3225 ret void 3226} 3227 3228define hidden void @fshli16_88(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr 
addrspace(1) %out0) { 3229; GFX10-LABEL: fshli16_88: 3230; GFX10: ; %bb.0: 3231; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3232; GFX10-NEXT: global_load_dword v6, v[0:1], off 3233; GFX10-NEXT: global_load_dword v7, v[2:3], off 3234; GFX10-NEXT: s_waitcnt vmcnt(0) 3235; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x30407 3236; GFX10-NEXT: global_store_dword v[4:5], v0, off 3237; GFX10-NEXT: s_setpc_b64 s[30:31] 3238; 3239; GFX9-LABEL: fshli16_88: 3240; GFX9: ; %bb.0: 3241; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3242; GFX9-NEXT: global_load_dword v6, v[0:1], off 3243; GFX9-NEXT: global_load_dword v7, v[2:3], off 3244; GFX9-NEXT: s_mov_b32 s4, 0x30407 3245; GFX9-NEXT: s_waitcnt vmcnt(0) 3246; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3247; GFX9-NEXT: global_store_dword v[4:5], v0, off 3248; GFX9-NEXT: s_waitcnt vmcnt(0) 3249; GFX9-NEXT: s_setpc_b64 s[30:31] 3250 %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 3251 %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 3252 %v1e0 = extractelement <2 x i16> %vec1, i64 0 3253 %v1e1 = extractelement <2 x i16> %vec1, i64 1 3254 %v2e0 = extractelement <2 x i16> %vec2, i64 0 3255 %v2e1 = extractelement <2 x i16> %vec2, i64 1 3256 3257 %tmp01.0 = call i16 @llvm.fshl.i16(i16 %v2e0, i16 %v2e1, i16 88) 3258 %byte01 = zext i16 %tmp01.0 to i32 3259 3260 %tmp23.0 = call i16 @llvm.fshl.i16(i16 %v1e0, i16 %v1e1, i16 88) 3261 %tmp23.1 = zext i16 %tmp23.0 to i32 3262 %byte23 = shl i32 %tmp23.1, 16 3263 %res = or i32 %byte01, %byte23 3264 store i32 %res, ptr addrspace(1) %out0, align 4 3265 ret void 3266} 3267 3268define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) { 3269; GFX10-LABEL: shlbase: 3270; GFX10: ; %bb.0: 3271; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3272; GFX10-NEXT: global_load_dword v7, v[0:1], off 3273; GFX10-NEXT: global_load_dword v8, v[2:3], off 3274; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6 3275; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v6 
3276; GFX10-NEXT: v_add_nc_u32_e32 v3, 8, v6 3277; GFX10-NEXT: s_waitcnt vmcnt(1) 3278; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v7 3279; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 3280; GFX10-NEXT: s_waitcnt vmcnt(0) 3281; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 3282; GFX10-NEXT: v_lshl_or_b32 v2, v2, v3, v2 3283; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1 3284; GFX10-NEXT: global_store_dword v[4:5], v0, off 3285; GFX10-NEXT: s_setpc_b64 s[30:31] 3286; 3287; GFX9-LABEL: shlbase: 3288; GFX9: ; %bb.0: 3289; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3290; GFX9-NEXT: global_load_dword v7, v[0:1], off 3291; GFX9-NEXT: global_load_dword v8, v[2:3], off 3292; GFX9-NEXT: v_add_u32_e32 v0, 8, v6 3293; GFX9-NEXT: v_add_u32_e32 v1, 16, v6 3294; GFX9-NEXT: v_add_u32_e32 v2, 24, v6 3295; GFX9-NEXT: s_waitcnt vmcnt(1) 3296; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v7 3297; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 3298; GFX9-NEXT: s_waitcnt vmcnt(0) 3299; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 3300; GFX9-NEXT: v_lshl_or_b32 v0, v3, v0, v3 3301; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 3302; GFX9-NEXT: global_store_dword v[4:5], v0, off 3303; GFX9-NEXT: s_waitcnt vmcnt(0) 3304; GFX9-NEXT: s_setpc_b64 s[30:31] 3305 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 3306 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 3307 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3308 %zv1e0 = zext i8 %v1e0 to i32 3309 %b8 = add i32 %base, 8 3310 %byte1 = shl i32 %zv1e0, %b8 3311 3312 %v1e3 = extractelement <4 x i8> %vec1, i64 3 3313 %zv1e3 = zext i8 %v1e3 to i32 3314 %b16 = add i32 %base, 16 3315 %byte2 = shl i32 %zv1e3, %b16 3316 %v2e3 = extractelement <4 x i8> %vec2, i64 3 3317 %zv2e3 = zext i8 %v2e3 to i32 3318 %b24 = add i32 
%base, 24 3319 %byte3 = shl i32 %zv2e3, %b24 3320 3321 %tmp0 = or i32 %zv1e0, %byte1 3322 %tmp1 = or i32 %tmp0, %byte2 3323 %res = or i32 %tmp1, %byte3 3324 store i32 %res, ptr addrspace(1) %out0, align 4 3325 ret void 3326} 3327 3328; TODO -- lower into v_perm 3329define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) { 3330; GFX10-LABEL: extractbase: 3331; GFX10: ; %bb.0: 3332; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3333; GFX10-NEXT: global_load_dword v7, v[0:1], off 3334; GFX10-NEXT: global_load_dword v8, v[2:3], off 3335; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6 3336; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v0 3337; GFX10-NEXT: s_waitcnt vmcnt(1) 3338; GFX10-NEXT: v_bfe_u32 v2, v7, v1, 8 3339; GFX10-NEXT: v_bfe_u32 v0, v7, v0, 8 3340; GFX10-NEXT: s_waitcnt vmcnt(0) 3341; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3342; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3343; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v0 3344; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 3345; GFX10-NEXT: global_store_dword v[4:5], v0, off 3346; GFX10-NEXT: s_setpc_b64 s[30:31] 3347; 3348; GFX9-LABEL: extractbase: 3349; GFX9: ; %bb.0: 3350; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3351; GFX9-NEXT: global_load_dword v7, v[0:1], off 3352; GFX9-NEXT: global_load_dword v8, v[2:3], off 3353; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v6 3354; GFX9-NEXT: v_add_u32_e32 v1, 24, v0 3355; GFX9-NEXT: s_waitcnt vmcnt(1) 3356; GFX9-NEXT: v_bfe_u32 v0, v7, v0, 8 3357; GFX9-NEXT: v_bfe_u32 v2, v7, v1, 8 3358; GFX9-NEXT: s_waitcnt vmcnt(0) 3359; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 3360; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3361; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0 3362; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 3363; GFX9-NEXT: global_store_dword v[4:5], v0, off 3364; GFX9-NEXT: s_waitcnt vmcnt(0) 3365; GFX9-NEXT: 
s_setpc_b64 s[30:31] 3366 %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 3367 %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 3368 %v1b = extractelement <4 x i8> %vec1, i64 %base 3369 %zv1b = zext i8 %v1b to i32 3370 %byte1 = shl i32 %zv1b, 8 3371 3372 %b3 = add i64 %base, 3 3373 %v1b3 = extractelement <4 x i8> %vec1, i64 %b3 3374 %zv1b3 = zext i8 %v1b3 to i32 3375 %byte2 = shl i32 %zv1b3, 16 3376 %v2b3 = extractelement <4 x i8> %vec2, i64 %b3 3377 %zv2b3 = zext i8 %v2b3 to i32 3378 %byte3 = shl i32 %zv2b3, 24 3379 3380 %tmp0 = or i32 %zv1b, %byte1 3381 %tmp1 = or i32 %tmp0, %byte2 3382 %res = or i32 %tmp1, %byte3 3383 store i32 %res, ptr addrspace(1) %out0, align 4 3384 ret void 3385} 3386 3387define hidden void @extract_hilo(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { 3388; GFX10-LABEL: extract_hilo: 3389; GFX10: ; %bb.0: 3390; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3391; GFX10-NEXT: global_load_dword v6, v[2:3], off 3392; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4 3393; GFX10-NEXT: s_waitcnt vmcnt(0) 3394; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x3060505 3395; GFX10-NEXT: global_store_dword v[4:5], v0, off 3396; GFX10-NEXT: s_setpc_b64 s[30:31] 3397; 3398; GFX9-LABEL: extract_hilo: 3399; GFX9: ; %bb.0: 3400; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3401; GFX9-NEXT: global_load_dword v6, v[2:3], off 3402; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4 3403; GFX9-NEXT: s_mov_b32 s4, 0x3060505 3404; GFX9-NEXT: s_waitcnt vmcnt(0) 3405; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 3406; GFX9-NEXT: global_store_dword v[4:5], v0, off 3407; GFX9-NEXT: s_waitcnt vmcnt(0) 3408; GFX9-NEXT: s_setpc_b64 s[30:31] 3409 %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4 3410 %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4 3411 %v1e5 = extractelement <8 x i8> %vec1, i64 5 3412 %zv1e5 = zext i8 %v1e5 to i32 3413 %byte1 = shl i32 %zv1e5, 8 3414 3415 %v1e6 = extractelement <8 x i8> %vec1, i64 6 3416 
%zv1e6 = zext i8 %v1e6 to i32
  %byte2 = shl i32 %zv1e6, 16
  %v2e3 = extractelement <8 x i8> %vec2, i64 3
  %zv2e3 = zext i8 %v2e3 to i32
  %byte3 = shl i32 %zv2e3, 24

  %tmp0 = or i32 %zv1e5, %byte1
  %tmp1 = or i32 %tmp0, %byte2
  %res = or i32 %tmp1, %byte3
  store i32 %res, ptr addrspace(1) %out0, align 4
  ret void
}

; Builds one i32 from bytes of two <8 x i8> loads:
;   byte 0 = vec1[0], byte 1 = vec1[0], byte 2 = vec1[3], byte 3 = vec2[4].
; All bytes taken from vec1 lie in its first four elements and the byte taken
; from vec2 lies in its last four. The expected output is one dword load per
; source followed by a single v_perm_b32.
define hidden void @extract_lohi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_lohi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v7, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x70404
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_lohi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v7, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, 0x70404
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
  %v1e0 = extractelement <8 x i8> %vec1, i64 0
  %zv1e0 = zext i8 %v1e0 to i32
  %byte1 = shl i32 %zv1e0, 8

  %v1e3 = extractelement <8 x i8> %vec1, i64 3
  %zv1e3 = zext i8 %v1e3 to i32
  %byte2 = shl i32 %zv1e3, 16
  %v2e4 = extractelement <8 x i8> %vec2, i64 4
  %zv2e4 = zext i8 %v2e4 to i32
  %byte3 = shl i32 %zv2e4, 24

  %tmp0 = or i32 %zv1e0, %byte1
  %tmp1 = or i32 %tmp0, %byte2
  %res = or i32 %tmp1, %byte3
  store i32 %res, ptr addrspace(1) %out0, align 4
  ret void
}

; Same byte-packing shape as above, but every extracted element lies in the
; last four elements of its <8 x i8> source:
;   byte 0 = vec1[5], byte 1 = vec1[5], byte 2 = vec1[7], byte 3 = vec2[6].
; The expected output loads one dword at offset 4 from each source and uses a
; single v_perm_b32.
define hidden void @extract_hihi(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_hihi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v6, v[2:3], off offset:4
; GFX10-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x2070505
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_hihi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v6, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:4
; GFX9-NEXT: s_mov_b32 s4, 0x2070505
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
  %v1e5 = extractelement <8 x i8> %vec1, i64 5
  %zv1e5 = zext i8 %v1e5 to i32
  %byte1 = shl i32 %zv1e5, 8

  %v1e7 = extractelement <8 x i8> %vec1, i64 7
  %zv1e7 = zext i8 %v1e7 to i32
  %byte2 = shl i32 %zv1e7, 16
  %v2e6 = extractelement <8 x i8> %vec2, i64 6
  %zv2e6 = zext i8 %v2e6 to i32
  %byte3 = shl i32 %zv2e6, 24

  %tmp0 = or i32 %zv1e5, %byte1
  %tmp1 = or i32 %tmp0, %byte2
  %res = or i32 %tmp1, %byte3
  store i32 %res, ptr addrspace(1) %out0, align 4
  ret void
}

; All bytes come from a single <8 x i8> load:
;   byte 0 = vec1[4], byte 1 = vec1[4], byte 2 = vec1[7], byte 3 = vec1[1].
; NOTE: despite their names, %v2e1 / %zv2e1 are extracted from %vec1; there is
; no second vector in this function.
; The expected output is one dwordx2 load and a single v_perm_b32 over its two
; halves.
define hidden void @extract_v8i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_v8i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x1070404
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v8i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, 0x1070404
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %v1e4 = extractelement <8 x i8> %vec1, i64 4
  %zv1e4 = zext i8 %v1e4 to i32
  %byte1 = shl i32 %zv1e4, 8

  %v1e7 = extractelement <8 x i8> %vec1, i64 7
  %zv1e7 = zext i8 %v1e7 to i32
  %byte2 = shl i32 %zv1e7, 16
  %v2e1 = extractelement <8 x i8> %vec1, i64 1
  %zv2e1 = zext i8 %v2e1 to i32
  %byte3 = shl i32 %zv2e1, 24

  %tmp0 = or i32 %zv1e4, %byte1
  %tmp1 = or i32 %tmp0, %byte2
  %res = or i32 %tmp1, %byte3
  store i32 %res, ptr addrspace(1) %out0, align 4
  ret void
}

; Same pattern on a <256 x i8> load, using only the last elements:
;   byte 0 = vec1[255], byte 1 = vec1[255], byte 2 = vec1[253], byte 3 = vec1[254].
; NOTE: the value names (%v1e4, %v1e7, %v2e1) do not reflect the indices used
; here (255, 253, 254).
; The expected output loads only the dword at offset 252 and permutes it with
; itself.
define hidden void @extract_v256i8(ptr addrspace(1) %in0, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_v256i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:252
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x6050707
; GFX10-NEXT: global_store_dword v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v256i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:252
; GFX9-NEXT: s_mov_b32 s4, 0x6050707
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec1 = load <256 x i8>, ptr addrspace(1) %in0, align 4
  %v1e4 = extractelement <256 x i8> %vec1, i64 255
  %zv1e4 = zext i8 %v1e4 to i32
  %byte1 = shl i32 %zv1e4, 8

  %v1e7 = extractelement <256 x i8> %vec1, i64 253
  %zv1e7 = zext i8 %v1e7 to i32
  %byte2 = shl i32 %zv1e7, 16
  %v2e1 = extractelement <256 x i8> %vec1, i64 254
  %zv2e1 = zext i8 %v2e1 to i32
  %byte3 = shl i32 %zv2e1, 24

  %tmp0 = or i32 %zv1e4, %byte1
  %tmp1 = or i32 %tmp0, %byte2
  %res = or i32 %tmp1, %byte3
  store i32 %res, ptr addrspace(1) %out0, align 4
  ret void
}

; TODO: support this pattern.
; The bytes come from three different dwords: vec1[0] (first dword of vec1),
; vec1[5] (second dword of vec1) and vec2[6] (second dword of vec2). The
; current expected output contains no v_perm_b32; it is a shift/and/or
; sequence instead (presumably because one v_perm_b32 reads only two source
; dwords -- confirm against the combine).
define hidden void @extract_3src(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: extract_3src:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: global_load_dword v8, v[2:3], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v8
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v6
; GFX10-NEXT: v_and_b32_e32 v0, 0xff0000, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff000000, v1
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 8, v2
; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_3src:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v8
; GFX9-NEXT: v_and_b32_e32 v1, 0xff0000, v1
; GFX9-NEXT: v_and_b32_e32 v2, 0xff000000, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0
; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec1 = load <8 x i8>, ptr addrspace(1) %in0, align 4
  %vec2 = load <8 x i8>, ptr addrspace(1) %in1, align 4
  %v1e0 = extractelement <8 x i8> %vec1, i64 0
  %zv1e0 = zext i8 %v1e0 to i32
  %byte1 = shl i32 %zv1e0, 8

  %v1e5 = extractelement <8 x i8> %vec1, i64 5
  %zv1e5 = zext i8 %v1e5 to i32
  %byte2 = shl i32 %zv1e5, 16
  %v2e6 = extractelement <8 x i8> %vec2, i64 6
  %zv2e6 = zext i8 %v2e6 to i32
  %byte3 = shl i32 %zv2e6, 24

  %tmp0 = or i32 %zv1e0, %byte1
  %tmp1 = or i32 %tmp0, %byte2
  %res = or i32 %tmp1, %byte3
  store i32 %res, ptr addrspace(1) %out0, align 4
  ret void
}

; Should not crash: the source is a <6 x i16> vector loaded with align 2.
; Packs elements 0 and 1 into %out0 (element 1 in the upper 16 bits) and
; elements 2 and 3 into %out1 (element 3 in the upper 16 bits).
; %in1 is not used by the body.
define hidden void @extract_v6i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v6i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3
; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:6
; GFX10-NEXT: global_load_ushort v3, v[0:1], off
; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:2
; GFX10-NEXT: global_load_ushort v9, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_lshl_or_b32 v0, v8, 16, v3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v9
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v6i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v[0:1], off offset:6
; GFX9-NEXT: global_load_ushort v3, v[0:1], off
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:4
; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v1, v9, 16, v3
; GFX9-NEXT: global_store_dword v[4:5], v1, off
; GFX9-NEXT: global_store_dword v[6:7], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec = load <6 x i16>, ptr addrspace(1) %in0, align 2
  %el0 = extractelement <6 x i16> %vec, i32 0
  %el1 = extractelement <6 x i16> %vec, i32 1
  %el2 = extractelement <6 x i16> %vec, i32 2
  %el3 = extractelement <6 x i16> %vec, i32 3
  %z0 = zext i16 %el0 to i32
  %z1 = zext i16 %el1 to i32
  %s1 = shl nuw i32 %z1, 16
  %o0 = or i32 %s1, %z0
  %z2 = zext i16 %el2 to i32
  %z3 = zext i16 %el3 to i32
  %s3 = shl nuw i32 %z3, 16
  %o1 = or i32 %z2, %s3

  store i32 %o0, ptr addrspace(1) %out0, align 4
  store i32 %o1, ptr addrspace(1) %out1, align 4
  ret void
}


; Should not crash: the source is a <7 x i16> vector loaded with align 2.
; Packs elements 0 and 1 into %out0 and elements 2 and 3 into %out1, with the
; odd-indexed element in the upper 16 bits of each result.
; %in1 is not used by the body.
define hidden void @extract_v7i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v7i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v7i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: global_store_dword v[6:7], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec = load <7 x i16>, ptr addrspace(1) %in0, align 2
  %el0 = extractelement <7 x i16> %vec, i32 0
  %el1 = extractelement <7 x i16> %vec, i32 1
  %el2 = extractelement <7 x i16> %vec, i32 2
  %el3 = extractelement <7 x i16> %vec, i32 3
  %z0 = zext i16 %el0 to i32
  %z1 = zext i16 %el1 to i32
  %s1 = shl nuw i32 %z1, 16
  %o0 = or i32 %s1, %z0
  %z2 = zext i16 %el2 to i32
  %z3 = zext i16 %el3 to i32
  %s3 = shl nuw i32 %z3, 16
  %o1 = or i32 %z2, %s3

  store i32 %o0, ptr addrspace(1) %out0, align 4
  store i32 %o1, ptr addrspace(1) %out1, align 4
  ret void
}

; Should not crash: the source is a <13 x i8> vector loaded with align 2.
; %out0 receives element 0 in bits 0-7 and element 1 in bits 16-23;
; %out1 receives element 7 in bits 0-7 and element 8 in bits 16-23.
; %in1 is not used by the body.
define hidden void @extract_v13i8(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v13i8:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX10-NEXT: global_load_ushort v8, v[0:1], off offset:8
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_bfe_u32 v0, v2, 8, 8
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v8
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040c00
; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x5040c03
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v13i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
; GFX9-NEXT: s_mov_b32 s4, 0x5040c00
; GFX9-NEXT: s_mov_b32 s5, 0x5040c03
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_bfe_u32 v0, v2, 8, 8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v8
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: v_perm_b32 v1, v1, v3, s5
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: global_store_dword v[6:7], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec = load <13 x i8>, ptr addrspace(1) %in0, align 2
  %el0 = extractelement <13 x i8> %vec, i32 0
  %el1 = extractelement <13 x i8> %vec, i32 1
  %el2 = extractelement <13 x i8> %vec, i32 7
  %el3 = extractelement <13 x i8> %vec, i32 8
  %z0 = zext i8 %el0 to i32
  %z1 = zext i8 %el1 to i32
  %s1 = shl nuw i32 %z1, 16
  %o0 = or i32 %s1, %z0
  %z2 = zext i8 %el2 to i32
  %z3 = zext i8 %el3 to i32
  %s3 = shl nuw i32 %z3, 16
  %o1 = or i32 %z2, %s3

  store i32 %o0, ptr addrspace(1) %out0, align 4
  store i32 %o1, ptr addrspace(1) %out1, align 4
  ret void
}

; Should not crash: the source is a <13 x i64> vector loaded with align 2.
; %out0 = (trunc-to-i32(element 1) << 16) | zext(trunc-to-i16(element 0 >> 32))
; %out1 = zext(trunc-to-i16(element 7)) | (trunc-to-i32(element 8) << 16)
; Both shifts carry the nuw flag even though their operands are full i32
; truncations rather than zero-extended i16 values.
; %in1 is not used by the body.
define hidden void @extract_v13i64(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
; GFX10-LABEL: extract_v13i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
; GFX10-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_perm_b32 v0, v12, v13, 0x1000504
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_perm_b32 v1, v10, v14, 0x1000504
; GFX10-NEXT: global_store_dword v[4:5], v0, off
; GFX10-NEXT: global_store_dword v[6:7], v1, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: extract_v13i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:48
; GFX9-NEXT: global_load_dwordx4 v[11:14], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:64
; GFX9-NEXT: s_mov_b32 s4, 0x1000504
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_perm_b32 v0, v12, v13, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v1, v10, v14, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: global_store_dword v[6:7], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec = load <13 x i64>, ptr addrspace(1) %in0, align 2
  %el0 = extractelement <13 x i64> %vec, i32 0
  %el1 = extractelement <13 x i64> %vec, i32 1
  %el2 = extractelement <13 x i64> %vec, i32 7
  %el3 = extractelement <13 x i64> %vec, i32 8
  %el00 = lshr i64 %el0, 32
  %t0 = trunc i64 %el00 to i16
  %z0 = zext i16 %t0 to i32
  %z1 = trunc i64 %el1 to i32
  %s1 = shl nuw i32 %z1, 16
  %o0 = or i32 %s1, %z0
  %t2 = trunc i64 %el2 to i16
  %z2 = zext i16 %t2 to i32
  %z3 = trunc i64 %el3 to i32
  %s3 = shl nuw i32 %z3, 16
  %o1 = or i32 %z2, %s3

  store i32 %o0, ptr addrspace(1) %out0, align 4
  store i32 %o1, ptr addrspace(1) %out1, align 4
  ret void
}


; Should combine the low 16 bits of each i32 element of the loaded <2 x i32>
; into a single dword: element 0 in the lower half, element 1 in the upper
; half. %in1 is not used by the body.
define hidden void @trunc_vector(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) {
; GFX10-LABEL: trunc_vector:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:4
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dword v[4:5], v2, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: trunc_vector:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: global_load_ushort v3, v[0:1], off offset:4
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_perm_b32 v0, v3, v2, s4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %vec = load <2 x i32>, ptr addrspace(1) %in0, align 2
  %tvec = trunc <2 x i32> %vec to <2 x i16>
  %el0 = extractelement <2 x i16> %tvec, i32 0
  %el1 = extractelement <2 x i16> %tvec, i32 1
  %z0 = zext i16 %el0 to i32
  %z1 = zext i16 %el1 to i32
  %s1 = shl nuw i32 %z1, 16
  %o0 = or i32 %s1, %z0

  store i32 %o0, ptr addrspace(1) %out0, align 4
  ret void
}