1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s 6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s 7 8define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { 9; GFX6-LABEL: v_ssubsat_i7: 10; GFX6: ; %bb.0: 11; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 13; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 14; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 15; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 16; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 17; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 18; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 19; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 20; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 21; GFX6-NEXT: v_ashrrev_i32_e32 v0, 25, v0 22; GFX6-NEXT: s_setpc_b64 s[30:31] 23; 24; GFX8-LABEL: v_ssubsat_i7: 25; GFX8: ; %bb.0: 26; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 28; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 29; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 30; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 31; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 32; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 33; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 34; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 35; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 36; GFX8-NEXT: v_ashrrev_i16_e32 v0, 9, v0 37; GFX8-NEXT: s_setpc_b64 s[30:31] 38; 39; GFX9-LABEL: v_ssubsat_i7: 40; GFX9: ; %bb.0: 41; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 42; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 43; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 44; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp 45; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 46; GFX9-NEXT: s_setpc_b64 s[30:31] 47; 48; GFX10PLUS-LABEL: v_ssubsat_i7: 49; GFX10PLUS: ; %bb.0: 50; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 51; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0 52; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1 53; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp 54; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 55; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 56 %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs) 57 ret i7 %result 58} 59 60define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { 61; GFX6-LABEL: s_ssubsat_i7: 62; GFX6: ; %bb.0: 63; GFX6-NEXT: s_lshl_b32 s0, s0, 25 64; GFX6-NEXT: s_max_i32 s2, s0, -1 65; GFX6-NEXT: s_lshl_b32 s1, s1, 25 66; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 67; GFX6-NEXT: s_min_i32 s3, s0, -1 68; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 69; GFX6-NEXT: s_max_i32 s1, s2, s1 70; GFX6-NEXT: s_min_i32 s1, s1, s3 71; GFX6-NEXT: s_sub_i32 s0, s0, s1 72; GFX6-NEXT: s_ashr_i32 s0, s0, 25 73; GFX6-NEXT: ; return to shader part epilog 74; 75; GFX8-LABEL: s_ssubsat_i7: 76; GFX8: ; %bb.0: 77; GFX8-NEXT: s_lshl_b32 s0, s0, 9 78; GFX8-NEXT: s_sext_i32_i16 s2, s0 79; GFX8-NEXT: s_sext_i32_i16 s3, -1 80; GFX8-NEXT: s_max_i32 s4, s2, s3 81; GFX8-NEXT: s_lshl_b32 s1, s1, 9 82; GFX8-NEXT: s_addk_i32 s4, 0x8001 83; GFX8-NEXT: s_min_i32 s2, s2, s3 84; GFX8-NEXT: s_sext_i32_i16 s3, s4 85; GFX8-NEXT: s_sext_i32_i16 s1, s1 86; GFX8-NEXT: s_addk_i32 s2, 0x8000 87; GFX8-NEXT: s_max_i32 s1, s3, s1 88; GFX8-NEXT: s_sext_i32_i16 s1, s1 89; GFX8-NEXT: s_sext_i32_i16 s2, s2 90; GFX8-NEXT: s_min_i32 s1, s1, s2 91; GFX8-NEXT: s_sub_i32 s0, s0, s1 92; GFX8-NEXT: s_sext_i32_i16 s0, s0 93; GFX8-NEXT: s_ashr_i32 s0, s0, 9 94; GFX8-NEXT: ; return to shader part epilog 95; 96; GFX9-LABEL: s_ssubsat_i7: 97; GFX9: ; %bb.0: 98; GFX9-NEXT: s_lshl_b32 s1, s1, 9 99; GFX9-NEXT: s_lshl_b32 s0, s0, 9 100; GFX9-NEXT: v_mov_b32_e32 v0, s1 101; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp 102; GFX9-NEXT: v_ashrrev_i16_e32 v0, 9, v0 103; GFX9-NEXT: v_readfirstlane_b32 s0, v0 104; GFX9-NEXT: ; return to shader part epilog 105; 106; GFX10PLUS-LABEL: s_ssubsat_i7: 107; GFX10PLUS: ; %bb.0: 108; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 109; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 110; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp 111; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 9, v0 112; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 113; GFX10PLUS-NEXT: ; return to shader part epilog 114 %result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs) 115 ret i7 %result 116} 117 118define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { 119; GFX6-LABEL: v_ssubsat_i8: 120; GFX6: ; %bb.0: 121; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 122; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 123; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 124; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 125; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 126; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 127; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 128; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 129; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 130; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 131; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 132; GFX6-NEXT: s_setpc_b64 s[30:31] 133; 134; GFX8-LABEL: v_ssubsat_i8: 135; GFX8: ; %bb.0: 136; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 137; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 138; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 139; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 140; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 141; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 142; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 143; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 144; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 145; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 146; GFX8-NEXT: v_ashrrev_i16_e32 v0, 8, v0 147; GFX8-NEXT: s_setpc_b64 s[30:31] 148; 149; GFX9-LABEL: v_ssubsat_i8: 150; GFX9: ; %bb.0: 151; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 152; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 153; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 154; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp 155; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 156; GFX9-NEXT: s_setpc_b64 s[30:31] 157; 158; GFX10PLUS-LABEL: v_ssubsat_i8: 159; GFX10PLUS: ; %bb.0: 160; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 161; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 162; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 163; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp 164; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 165; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 166 %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs) 167 ret i8 %result 168} 169 170define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { 171; GFX6-LABEL: s_ssubsat_i8: 172; GFX6: ; %bb.0: 173; GFX6-NEXT: s_lshl_b32 s0, s0, 24 174; GFX6-NEXT: s_max_i32 s2, s0, -1 175; GFX6-NEXT: s_lshl_b32 s1, s1, 24 176; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 177; GFX6-NEXT: s_min_i32 s3, s0, -1 178; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 179; GFX6-NEXT: s_max_i32 s1, s2, s1 180; GFX6-NEXT: s_min_i32 s1, s1, s3 181; GFX6-NEXT: s_sub_i32 s0, s0, s1 182; GFX6-NEXT: s_ashr_i32 s0, s0, 24 183; GFX6-NEXT: ; return to shader part epilog 184; 185; GFX8-LABEL: s_ssubsat_i8: 186; GFX8: ; %bb.0: 187; GFX8-NEXT: s_lshl_b32 s0, s0, 8 188; GFX8-NEXT: s_sext_i32_i16 s2, s0 189; GFX8-NEXT: s_sext_i32_i16 s3, -1 190; GFX8-NEXT: s_max_i32 s4, s2, s3 191; GFX8-NEXT: s_lshl_b32 s1, s1, 8 192; GFX8-NEXT: s_addk_i32 s4, 0x8001 193; GFX8-NEXT: s_min_i32 s2, s2, s3 194; GFX8-NEXT: s_sext_i32_i16 s3, s4 195; GFX8-NEXT: s_sext_i32_i16 s1, s1 196; GFX8-NEXT: s_addk_i32 s2, 0x8000 197; GFX8-NEXT: s_max_i32 s1, s3, s1 198; GFX8-NEXT: s_sext_i32_i16 s1, s1 199; GFX8-NEXT: s_sext_i32_i16 s2, s2 200; GFX8-NEXT: s_min_i32 s1, s1, s2 201; GFX8-NEXT: s_sub_i32 s0, s0, s1 202; GFX8-NEXT: s_sext_i32_i16 s0, s0 203; GFX8-NEXT: s_ashr_i32 s0, s0, 8 204; GFX8-NEXT: ; return to shader part epilog 205; 206; GFX9-LABEL: s_ssubsat_i8: 207; GFX9: ; %bb.0: 208; GFX9-NEXT: s_lshl_b32 s1, s1, 8 209; GFX9-NEXT: s_lshl_b32 s0, s0, 8 210; GFX9-NEXT: v_mov_b32_e32 v0, s1 211; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp 212; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 213; GFX9-NEXT: v_readfirstlane_b32 s0, v0 214; GFX9-NEXT: ; return to shader part epilog 215; 216; GFX10PLUS-LABEL: s_ssubsat_i8: 217; GFX10PLUS: ; %bb.0: 218; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 219; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 220; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp 221; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 222; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 223; GFX10PLUS-NEXT: ; return to shader part epilog 224 %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs) 225 ret i8 %result 226} 227 228define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { 229; GFX6-LABEL: v_ssubsat_v2i8: 230; GFX6: ; %bb.0: 231; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 232; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 233; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 234; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 235; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 236; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 237; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 238; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 239; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 240; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 241; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 242; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 243; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 244; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 245; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 246; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 247; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 248; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 249; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 250; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 251; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 252; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 253; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 254; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 255; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 256; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 257; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 258; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 259; GFX6-NEXT: s_setpc_b64 s[30:31] 260; 261; GFX8-LABEL: v_ssubsat_v2i8: 262; GFX8: ; %bb.0: 263; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 264; GFX8-NEXT: v_mov_b32_e32 v2, 8 265; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 266; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 267; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 268; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 269; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 270; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 271; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 272; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 273; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 274; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 275; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 276; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 277; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 278; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 279; GFX8-NEXT: v_add_u16_e32 v4, 0x8000, v4 280; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 281; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 282; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 283; GFX8-NEXT: v_mov_b32_e32 v2, 0xff 284; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 285; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 286; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 287; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 288; GFX8-NEXT: s_setpc_b64 s[30:31] 289; 290; GFX9-LABEL: v_ssubsat_v2i8: 291; GFX9: ; %bb.0: 292; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 293; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 294; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 295; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 296; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 297; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 298; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 299; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 300; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 301; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 302; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 303; GFX9-NEXT: v_mov_b32_e32 v1, 0xff 304; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 305; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 306; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 307; GFX9-NEXT: s_setpc_b64 s[30:31] 308; 309; GFX10-LABEL: v_ssubsat_v2i8: 310; GFX10: ; %bb.0: 311; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 312; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 313; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 314; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 315; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 316; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 317; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 318; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 319; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 320; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 321; GFX10-NEXT: v_mov_b32_e32 v1, 0xff 322; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 323; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 324; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 325; GFX10-NEXT: s_setpc_b64 s[30:31] 326; 327; GFX11-LABEL: v_ssubsat_v2i8: 328; GFX11: ; %bb.0: 329; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 330; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 331; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 332; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 333; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 334; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 335; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 336; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 337; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 338; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 339; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 340; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 341; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 342; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 343; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 344; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 345; GFX11-NEXT: s_setpc_b64 s[30:31] 346 %lhs = bitcast i16 %lhs.arg to <2 x i8> 347 %rhs = bitcast i16 %rhs.arg to <2 x i8> 348 %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 349 %cast.result = bitcast <2 x i8> %result to i16 350 ret i16 %cast.result 351} 352 353define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { 354; GFX6-LABEL: s_ssubsat_v2i8: 355; GFX6: ; %bb.0: 356; GFX6-NEXT: s_lshr_b32 s2, s0, 8 357; GFX6-NEXT: s_lshl_b32 s0, s0, 24 358; GFX6-NEXT: s_max_i32 s4, s0, -1 359; GFX6-NEXT: s_lshr_b32 s3, s1, 8 360; GFX6-NEXT: s_lshl_b32 s1, s1, 24 361; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 362; GFX6-NEXT: s_min_i32 s5, s0, -1 363; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 364; GFX6-NEXT: s_max_i32 s1, s4, s1 365; GFX6-NEXT: s_min_i32 s1, s1, s5 366; GFX6-NEXT: s_sub_i32 s0, s0, s1 367; GFX6-NEXT: s_lshl_b32 s1, s2, 24 368; GFX6-NEXT: s_lshl_b32 s2, s3, 24 369; GFX6-NEXT: s_max_i32 s3, s1, -1 370; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 371; GFX6-NEXT: s_min_i32 s4, s1, -1 372; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 373; GFX6-NEXT: s_max_i32 s2, s3, s2 374; GFX6-NEXT: s_min_i32 s2, s2, s4 375; GFX6-NEXT: s_sub_i32 s1, s1, s2 376; GFX6-NEXT: s_ashr_i32 s1, s1, 24 377; GFX6-NEXT: s_ashr_i32 s0, s0, 24 378; GFX6-NEXT: s_and_b32 s1, s1, 0xff 379; GFX6-NEXT: s_and_b32 s0, s0, 0xff 380; GFX6-NEXT: s_lshl_b32 s1, s1, 8 381; GFX6-NEXT: s_or_b32 s0, s0, s1 382; GFX6-NEXT: ; return to shader part epilog 383; 384; GFX8-LABEL: s_ssubsat_v2i8: 385; GFX8: ; %bb.0: 386; GFX8-NEXT: s_lshr_b32 s2, s0, 8 387; GFX8-NEXT: s_lshl_b32 s0, s0, 8 388; GFX8-NEXT: s_sext_i32_i16 s4, s0 389; GFX8-NEXT: s_sext_i32_i16 s5, -1 390; GFX8-NEXT: s_max_i32 s6, s4, s5 391; GFX8-NEXT: s_lshr_b32 s3, s1, 8 392; GFX8-NEXT: s_lshl_b32 s1, s1, 8 393; GFX8-NEXT: s_addk_i32 s6, 0x8001 394; GFX8-NEXT: s_min_i32 s4, s4, s5 395; GFX8-NEXT: s_sext_i32_i16 s6, s6 396; GFX8-NEXT: s_sext_i32_i16 s1, s1 397; GFX8-NEXT: s_addk_i32 s4, 0x8000 398; GFX8-NEXT: s_max_i32 s1, s6, s1 399; GFX8-NEXT: s_sext_i32_i16 s1, s1 400; GFX8-NEXT: s_sext_i32_i16 s4, s4 401; GFX8-NEXT: s_min_i32 s1, s1, s4 402; GFX8-NEXT: s_sub_i32 s0, s0, s1 403; GFX8-NEXT: s_lshl_b32 s1, s2, 8 404; GFX8-NEXT: s_lshl_b32 s2, s3, 8 405; GFX8-NEXT: s_sext_i32_i16 s3, s1 406; GFX8-NEXT: s_max_i32 s4, s3, s5 407; GFX8-NEXT: s_addk_i32 s4, 0x8001 408; GFX8-NEXT: s_min_i32 s3, s3, s5 409; GFX8-NEXT: s_sext_i32_i16 s4, s4 410; GFX8-NEXT: s_sext_i32_i16 s2, s2 411; GFX8-NEXT: s_addk_i32 s3, 0x8000 412; GFX8-NEXT: s_max_i32 s2, s4, s2 413; GFX8-NEXT: s_sext_i32_i16 s2, s2 414; GFX8-NEXT: s_sext_i32_i16 s3, s3 415; GFX8-NEXT: s_min_i32 s2, s2, s3 416; GFX8-NEXT: s_sub_i32 s1, s1, s2 417; GFX8-NEXT: s_sext_i32_i16 s1, s1 418; GFX8-NEXT: s_sext_i32_i16 s0, s0 419; GFX8-NEXT: s_ashr_i32 s1, s1, 8 420; GFX8-NEXT: s_ashr_i32 s0, s0, 8 421; GFX8-NEXT: s_and_b32 s1, s1, 0xff 422; GFX8-NEXT: s_and_b32 s0, s0, 0xff 423; GFX8-NEXT: s_lshl_b32 s1, s1, 8 424; GFX8-NEXT: s_or_b32 s0, s0, s1 425; GFX8-NEXT: ; return to shader part epilog 426; 427; GFX9-LABEL: s_ssubsat_v2i8: 428; GFX9: ; %bb.0: 429; GFX9-NEXT: s_lshr_b32 s2, s0, 8 430; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 431; GFX9-NEXT: s_lshr_b32 s3, s1, 8 432; GFX9-NEXT: s_lshr_b32 s2, s0, 16 433; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 434; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 435; GFX9-NEXT: s_lshl_b32 s2, s2, 8 436; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 437; GFX9-NEXT: s_lshr_b32 s2, s1, 16 438; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 439; GFX9-NEXT: s_lshl_b32 s2, s2, 8 440; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 441; GFX9-NEXT: v_mov_b32_e32 v0, s1 442; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 443; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 444; GFX9-NEXT: v_mov_b32_e32 v1, 0xff 445; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 446; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 447; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 448; GFX9-NEXT: v_readfirstlane_b32 s0, v0 449; GFX9-NEXT: ; return to shader part epilog 450; 451; GFX10-LABEL: s_ssubsat_v2i8: 452; GFX10: ; %bb.0: 453; GFX10-NEXT: s_lshr_b32 s2, s0, 8 454; GFX10-NEXT: s_lshr_b32 s3, s1, 8 455; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 456; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 457; GFX10-NEXT: s_lshr_b32 s2, s0, 16 458; GFX10-NEXT: s_lshr_b32 s3, s1, 16 459; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 460; GFX10-NEXT: s_lshl_b32 s2, s2, 8 461; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 462; GFX10-NEXT: s_lshl_b32 s3, s3, 8 463; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 464; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 465; GFX10-NEXT: v_mov_b32_e32 v1, 0xff 466; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp 467; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 468; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 469; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 470; GFX10-NEXT: v_readfirstlane_b32 s0, v0 471; GFX10-NEXT: ; return to shader part epilog 472; 473; GFX11-LABEL: s_ssubsat_v2i8: 474; GFX11: ; %bb.0: 475; GFX11-NEXT: s_lshr_b32 s2, s0, 8 476; GFX11-NEXT: s_lshr_b32 s3, s1, 8 477; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 478; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 479; GFX11-NEXT: s_lshr_b32 s2, s0, 16 480; GFX11-NEXT: s_lshr_b32 s3, s1, 16 481; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 482; GFX11-NEXT: s_lshl_b32 s2, s2, 8 483; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 484; GFX11-NEXT: s_lshl_b32 s3, s3, 8 485; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 486; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 487; GFX11-NEXT: v_pk_sub_i16 v0, s0, s1 clamp 488; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 489; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 490; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 491; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 492; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 493; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 494; GFX11-NEXT: v_readfirstlane_b32 s0, v0 495; GFX11-NEXT: ; return to shader part epilog 496 %lhs = bitcast i16 %lhs.arg to <2 x i8> 497 %rhs = bitcast i16 %rhs.arg to <2 x i8> 498 %result = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 499 %cast.result = bitcast <2 x i8> %result to i16 500 ret i16 %cast.result 501} 502 503define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { 504; GFX6-LABEL: v_ssubsat_v4i8: 505; GFX6: ; %bb.0: 506; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 507; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 508; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 509; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 510; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 511; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 512; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 513; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 514; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 515; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 516; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 517; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 518; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 519; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 520; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 521; GFX6-NEXT: v_min_i32_e32 v1, v1, v10 522; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 523; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 524; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 525; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 526; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 527; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 528; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 529; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 530; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 531; GFX6-NEXT: v_min_i32_e32 v2, v2, v8 532; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 533; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 534; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 535; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 536; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 537; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 538; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 539; GFX6-NEXT: v_max_i32_e32 v3, v5, v3 540; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 541; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 542; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 543; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 544; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 545; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 546; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 547; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 548; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 549; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 550; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 551; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 552; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 553; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 554; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 555; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 556; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 557; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 558; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v2 559; GFX6-NEXT: v_ashrrev_i32_e32 v3, 24, v3 560; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 561; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 562; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 563; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 564; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 565; GFX6-NEXT: s_setpc_b64 s[30:31] 566; 567; GFX8-LABEL: v_ssubsat_v4i8: 568; GFX8: ; %bb.0: 569; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 570; GFX8-NEXT: v_mov_b32_e32 v2, 8 571; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 572; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 573; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 574; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 575; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 576; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 577; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 578; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 579; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 580; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 581; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 582; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 583; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 584; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 585; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 586; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 587; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 588; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 589; GFX8-NEXT: v_add_u16_e32 v8, 0x8000, v8 590; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 591; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 592; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 593; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 594; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 595; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 596; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 597; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 598; GFX8-NEXT: v_add_u16_e32 v6, 0x8000, v6 599; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 600; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 601; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 602; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 603; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 604; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 605; GFX8-NEXT: v_add_u16_e32 v5, 0x8001, v5 606; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 607; GFX8-NEXT: v_add_u16_e32 v6, 0x8000, v6 608; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 609; GFX8-NEXT: v_min_i16_e32 v4, v4, v6 610; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4 611; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 612; GFX8-NEXT: v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 613; GFX8-NEXT: v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 614; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 615; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 616; GFX8-NEXT: v_and_b32_sdwa v1, sext(v2), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 617; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 618; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 619; GFX8-NEXT: v_and_b32_sdwa v1, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 620; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 621; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 622; GFX8-NEXT: s_setpc_b64 s[30:31] 623; 624; GFX9-LABEL: v_ssubsat_v4i8: 625; GFX9: ; %bb.0: 626; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 627; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 628; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 629; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 630; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 631; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 632; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 633; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 634; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 635; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 636; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 637; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 638; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 639; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 640; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 641; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 clamp 642; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 643; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] 644; GFX9-NEXT: v_mov_b32_e32 v3, 8 645; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 646; GFX9-NEXT: v_mov_b32_e32 v2, 0xff 647; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 648; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 649; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 650; GFX9-NEXT: v_mov_b32_e32 v3, 24 651; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 652; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 653; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 654; GFX9-NEXT: s_setpc_b64 s[30:31] 655; 656; GFX10-LABEL: v_ssubsat_v4i8: 657; GFX10: ; %bb.0: 658; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 659; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 660; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 661; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 662; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 663; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1 664; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 665; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 666; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 667; GFX10-NEXT: v_mov_b32_e32 v4, 24 668; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6 669; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16 670; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 671; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 672; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 673; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 674; GFX10-NEXT: v_pk_sub_i16 v2, v2, v3 clamp 675; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 676; GFX10-NEXT: v_mov_b32_e32 v1, 8 677; GFX10-NEXT: v_pk_ashrrev_i16 v2, 8, v2 op_sel_hi:[0,1] 678; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 679; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 680; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 681; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 682; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 683; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 684; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 685; GFX10-NEXT: s_setpc_b64 s[30:31] 686; 687; GFX11-LABEL: v_ssubsat_v4i8: 688; GFX11: ; %bb.0: 689; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 690; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 691; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 692; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 693; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 694; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 695; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 696; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 697; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 698; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 699; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 700; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 701; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 702; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 703; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 704; GFX11-NEXT: v_pk_sub_i16 v2, v2, v3 clamp 705; GFX11-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 706; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1] 707; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 708; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 709; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 710; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 711; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 712; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 713; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 714; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 715; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 716; GFX11-NEXT: s_setpc_b64 s[30:31] 717 %lhs = bitcast i32 %lhs.arg to <4 x i8> 718 %rhs = bitcast i32 %rhs.arg to <4 x i8> 719 %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 720 %cast.result = bitcast <4 x i8> %result to i32 721 ret i32 %cast.result 722} 723 724define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { 725; GFX6-LABEL: s_ssubsat_v4i8: 726; GFX6: ; %bb.0: 727; GFX6-NEXT: s_lshr_b32 s2, s0, 8 728; GFX6-NEXT: s_lshr_b32 s3, s0, 16 729; GFX6-NEXT: s_lshr_b32 s4, s0, 24 730; GFX6-NEXT: s_lshl_b32 s0, s0, 24 731; GFX6-NEXT: s_max_i32 s8, s0, -1 732; GFX6-NEXT: s_lshr_b32 s5, s1, 8 733; GFX6-NEXT: s_lshr_b32 s6, s1, 16 734; GFX6-NEXT: s_lshr_b32 s7, s1, 24 735; GFX6-NEXT: s_lshl_b32 s1, s1, 24 736; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 737; GFX6-NEXT: s_min_i32 s9, s0, -1 738; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 739; GFX6-NEXT: s_max_i32 s1, s8, s1 740; GFX6-NEXT: s_min_i32 s1, s1, s9 741; GFX6-NEXT: s_sub_i32 s0, s0, s1 742; GFX6-NEXT: s_lshl_b32 s1, s2, 24 743; GFX6-NEXT: s_lshl_b32 s2, s5, 24 744; GFX6-NEXT: s_max_i32 s5, s1, -1 745; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 746; GFX6-NEXT: s_min_i32 s8, s1, -1 747; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 748; GFX6-NEXT: s_max_i32 s2, s5, s2 749; GFX6-NEXT: s_min_i32 s2, s2, s8 750; GFX6-NEXT: s_sub_i32 s1, s1, s2 751; GFX6-NEXT: s_lshl_b32 s2, s3, 24 752; GFX6-NEXT: s_max_i32 s5, s2, -1 753; GFX6-NEXT: s_lshl_b32 s3, s6, 24 754; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 755; GFX6-NEXT: s_min_i32 s6, s2, -1 756; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 757; GFX6-NEXT: s_max_i32 s3, s5, s3 758; GFX6-NEXT: s_min_i32 s3, s3, s6 759; GFX6-NEXT: s_sub_i32 s2, s2, s3 760; GFX6-NEXT: s_lshl_b32 s3, s4, 24 761; GFX6-NEXT: s_max_i32 s5, s3, -1 762; GFX6-NEXT: s_ashr_i32 s1, s1, 24 763; GFX6-NEXT: s_lshl_b32 s4, s7, 24 764; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 765; GFX6-NEXT: s_min_i32 s6, s3, -1 766; GFX6-NEXT: s_ashr_i32 s0, s0, 24 767; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 768; GFX6-NEXT: s_max_i32 s4, s5, s4 769; GFX6-NEXT: s_and_b32 s1, s1, 0xff 770; GFX6-NEXT: s_ashr_i32 s2, s2, 24 771; GFX6-NEXT: s_min_i32 s4, s4, s6 772; GFX6-NEXT: s_and_b32 s0, s0, 0xff 773; GFX6-NEXT: s_lshl_b32 s1, s1, 8 774; GFX6-NEXT: s_sub_i32 s3, s3, s4 775; GFX6-NEXT: s_or_b32 s0, s0, s1 776; GFX6-NEXT: s_and_b32 s1, s2, 0xff 777; GFX6-NEXT: s_ashr_i32 s3, s3, 24 778; GFX6-NEXT: s_lshl_b32 s1, s1, 16 779; GFX6-NEXT: s_or_b32 s0, s0, s1 780; GFX6-NEXT: s_and_b32 s1, s3, 0xff 781; GFX6-NEXT: s_lshl_b32 s1, s1, 24 782; GFX6-NEXT: s_or_b32 s0, s0, s1 783; GFX6-NEXT: ; return to shader part epilog 784; 785; GFX8-LABEL: s_ssubsat_v4i8: 786; GFX8: ; %bb.0: 787; GFX8-NEXT: s_lshr_b32 s2, s0, 8 788; GFX8-NEXT: s_lshr_b32 s3, s0, 16 789; GFX8-NEXT: s_lshr_b32 s4, s0, 24 790; GFX8-NEXT: s_lshl_b32 s0, s0, 8 791; GFX8-NEXT: s_sext_i32_i16 s8, s0 792; GFX8-NEXT: s_sext_i32_i16 s9, -1 793; GFX8-NEXT: s_max_i32 s10, s8, s9 794; GFX8-NEXT: s_lshr_b32 s5, s1, 8 795; GFX8-NEXT: s_lshr_b32 s6, s1, 16 796; GFX8-NEXT: s_lshr_b32 s7, s1, 24 797; GFX8-NEXT: s_lshl_b32 s1, s1, 8 798; GFX8-NEXT: s_addk_i32 s10, 0x8001 799; GFX8-NEXT: s_min_i32 s8, s8, s9 800; GFX8-NEXT: s_sext_i32_i16 s10, s10 801; GFX8-NEXT: s_sext_i32_i16 s1, s1 802; GFX8-NEXT: s_addk_i32 s8, 0x8000 803; GFX8-NEXT: s_max_i32 s1, s10, s1 804; GFX8-NEXT: s_sext_i32_i16 s1, s1 805; GFX8-NEXT: s_sext_i32_i16 s8, s8 806; GFX8-NEXT: s_min_i32 s1, s1, s8 807; GFX8-NEXT: s_sub_i32 s0, s0, s1 808; GFX8-NEXT: s_lshl_b32 s1, s2, 8 809; GFX8-NEXT: s_lshl_b32 s2, s5, 8 810; GFX8-NEXT: s_sext_i32_i16 s5, s1 811; GFX8-NEXT: s_max_i32 s8, s5, s9 812; GFX8-NEXT: s_addk_i32 s8, 0x8001 813; GFX8-NEXT: s_min_i32 s5, s5, s9 814; GFX8-NEXT: s_sext_i32_i16 s8, s8 815; GFX8-NEXT: s_sext_i32_i16 s2, s2 816; GFX8-NEXT: s_addk_i32 s5, 0x8000 817; GFX8-NEXT: s_max_i32 s2, s8, s2 818; GFX8-NEXT: s_sext_i32_i16 s2, s2 819; GFX8-NEXT: s_sext_i32_i16 s5, s5 820; GFX8-NEXT: s_min_i32 s2, s2, s5 821; GFX8-NEXT: s_sub_i32 s1, s1, s2 822; GFX8-NEXT: s_lshl_b32 s2, s3, 8 823; GFX8-NEXT: s_sext_i32_i16 s5, s2 824; GFX8-NEXT: s_lshl_b32 s3, s6, 8 825; GFX8-NEXT: s_max_i32 s6, s5, s9 826; GFX8-NEXT: s_addk_i32 s6, 0x8001 827; GFX8-NEXT: s_min_i32 s5, s5, s9 828; GFX8-NEXT: s_sext_i32_i16 s6, s6 829; GFX8-NEXT: s_sext_i32_i16 s3, s3 830; GFX8-NEXT: s_addk_i32 s5, 0x8000 831; GFX8-NEXT: s_max_i32 s3, s6, s3 832; GFX8-NEXT: s_sext_i32_i16 s3, s3 833; GFX8-NEXT: s_sext_i32_i16 s5, s5 834; GFX8-NEXT: s_min_i32 s3, s3, s5 835; GFX8-NEXT: s_sub_i32 s2, s2, s3 836; GFX8-NEXT: s_lshl_b32 s3, s4, 8 837; GFX8-NEXT: s_sext_i32_i16 s5, s3 838; GFX8-NEXT: s_max_i32 s6, s5, s9 839; GFX8-NEXT: s_lshl_b32 s4, s7, 8 840; GFX8-NEXT: s_addk_i32 s6, 0x8001 841; GFX8-NEXT: s_min_i32 s5, s5, s9 842; GFX8-NEXT: s_sext_i32_i16 s6, s6 843; GFX8-NEXT: s_sext_i32_i16 s4, s4 844; GFX8-NEXT: s_sext_i32_i16 s1, s1 845; GFX8-NEXT: s_addk_i32 s5, 0x8000 846; GFX8-NEXT: s_max_i32 s4, s6, s4 847; GFX8-NEXT: s_sext_i32_i16 s0, s0 848; GFX8-NEXT: s_ashr_i32 s1, s1, 8 849; GFX8-NEXT: s_sext_i32_i16 s4, s4 850; GFX8-NEXT: s_sext_i32_i16 s5, s5 851; GFX8-NEXT: s_ashr_i32 s0, s0, 8 852; GFX8-NEXT: s_sext_i32_i16 s2, s2 853; GFX8-NEXT: s_min_i32 s4, s4, s5 854; GFX8-NEXT: s_and_b32 s1, s1, 0xff 855; GFX8-NEXT: s_ashr_i32 s2, s2, 8 856; GFX8-NEXT: s_sub_i32 s3, s3, s4 857; GFX8-NEXT: s_and_b32 s0, s0, 0xff 858; GFX8-NEXT: s_lshl_b32 s1, s1, 8 859; GFX8-NEXT: s_sext_i32_i16 s3, s3 860; GFX8-NEXT: s_or_b32 s0, s0, s1 861; GFX8-NEXT: s_and_b32 s1, s2, 0xff 862; GFX8-NEXT: s_ashr_i32 s3, s3, 8 863; GFX8-NEXT: s_lshl_b32 s1, s1, 16 864; GFX8-NEXT: s_or_b32 s0, s0, s1 865; GFX8-NEXT: s_and_b32 s1, s3, 0xff 866; GFX8-NEXT: s_lshl_b32 s1, s1, 24 867; GFX8-NEXT: s_or_b32 s0, s0, s1 868; GFX8-NEXT: ; return to shader part epilog 869; 870; GFX9-LABEL: s_ssubsat_v4i8: 871; GFX9: ; %bb.0: 872; GFX9-NEXT: s_lshr_b32 s2, s0, 8 873; GFX9-NEXT: s_lshr_b32 s3, s0, 16 874; GFX9-NEXT: s_lshr_b32 s4, s0, 24 875; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 876; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 877; GFX9-NEXT: s_lshr_b32 s4, s0, 16 878; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 879; GFX9-NEXT: s_lshl_b32 s4, s4, 8 880; GFX9-NEXT: s_lshr_b32 s5, s1, 8 881; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 882; GFX9-NEXT: s_lshr_b32 s4, s2, 16 883; GFX9-NEXT: s_lshr_b32 s6, s1, 16 884; GFX9-NEXT: s_lshr_b32 s7, s1, 24 885; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 886; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 887; GFX9-NEXT: s_lshl_b32 s4, s4, 8 888; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 889; GFX9-NEXT: s_lshr_b32 s4, s1, 16 890; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 891; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 892; GFX9-NEXT: s_lshl_b32 s4, s4, 8 893; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 894; GFX9-NEXT: s_lshr_b32 s4, s3, 16 895; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 896; GFX9-NEXT: s_lshl_b32 s4, s4, 8 897; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 898; GFX9-NEXT: v_mov_b32_e32 v0, s1 899; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 900; GFX9-NEXT: v_mov_b32_e32 v1, s3 901; GFX9-NEXT: v_pk_sub_i16 v1, s2, v1 clamp 902; GFX9-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 903; GFX9-NEXT: v_mov_b32_e32 v3, 8 904; GFX9-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 905; GFX9-NEXT: v_mov_b32_e32 v2, 0xff 906; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 907; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 908; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 909; GFX9-NEXT: v_mov_b32_e32 v3, 24 910; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 911; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 912; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 913; GFX9-NEXT: v_readfirstlane_b32 s0, v0 914; GFX9-NEXT: ; return to shader part epilog 915; 916; GFX10-LABEL: s_ssubsat_v4i8: 917; GFX10: ; %bb.0: 918; GFX10-NEXT: s_lshr_b32 s2, s0, 8 919; GFX10-NEXT: s_lshr_b32 s3, s0, 16 920; GFX10-NEXT: s_lshr_b32 s4, s0, 24 921; GFX10-NEXT: s_lshr_b32 s5, s1, 8 922; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 923; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 924; GFX10-NEXT: s_lshr_b32 s6, s1, 16 925; GFX10-NEXT: s_lshr_b32 s7, s1, 24 926; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 927; GFX10-NEXT: s_lshr_b32 s4, s0, 16 928; GFX10-NEXT: s_lshr_b32 s5, s2, 16 929; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 930; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 931; GFX10-NEXT: s_lshl_b32 s4, s4, 8 932; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 933; GFX10-NEXT: s_lshl_b32 s5, s5, 8 934; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 935; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 936; GFX10-NEXT: s_lshr_b32 s4, s1, 16 937; GFX10-NEXT: s_lshr_b32 s5, s3, 16 938; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 939; GFX10-NEXT: s_lshl_b32 s4, s4, 8 940; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 941; GFX10-NEXT: s_lshl_b32 s5, s5, 8 942; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 943; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 944; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp 945; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp 946; GFX10-NEXT: v_mov_b32_e32 v2, 8 947; GFX10-NEXT: v_mov_b32_e32 v4, 24 948; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 949; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 950; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 951; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 952; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 953; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 954; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 955; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 956; GFX10-NEXT: v_readfirstlane_b32 s0, v0 957; GFX10-NEXT: ; return to shader part epilog 958; 959; GFX11-LABEL: s_ssubsat_v4i8: 960; GFX11: ; %bb.0: 961; GFX11-NEXT: s_lshr_b32 s2, s0, 8 962; GFX11-NEXT: s_lshr_b32 s3, s0, 24 963; GFX11-NEXT: s_lshr_b32 s4, s1, 8 964; GFX11-NEXT: s_lshr_b32 s5, s1, 24 965; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 966; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 967; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 968; GFX11-NEXT: s_lshr_b32 s4, s2, 16 969; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 970; GFX11-NEXT: s_lshr_b32 s5, s3, 16 971; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 972; GFX11-NEXT: s_lshl_b32 s4, s4, 8 973; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 974; GFX11-NEXT: s_lshl_b32 s5, s5, 8 975; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 976; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 977; GFX11-NEXT: s_lshr_b32 s4, s0, 16 978; GFX11-NEXT: s_lshr_b32 s5, s1, 16 979; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 clamp 980; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 981; GFX11-NEXT: s_lshl_b32 s4, s4, 8 982; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 983; GFX11-NEXT: s_lshl_b32 s2, s5, 8 984; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 985; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 986; GFX11-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] 987; GFX11-NEXT: v_pk_sub_i16 v1, s0, s1 clamp 988; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 989; GFX11-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] 990; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 991; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 992; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 993; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 994; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 995; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 996; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 997; GFX11-NEXT: v_readfirstlane_b32 s0, v0 998; GFX11-NEXT: ; return to shader part epilog 999 %lhs = bitcast i32 %lhs.arg to <4 x i8> 1000 %rhs = bitcast i32 %rhs.arg to <4 x i8> 1001 %result = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 1002 %cast.result = bitcast <4 x i8> %result to i32 1003 ret i32 %cast.result 1004} 1005 1006define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { 1007; GFX6-LABEL: v_ssubsat_i24: 1008; GFX6: ; %bb.0: 1009; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1010; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1011; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 1012; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1013; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 1014; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 1015; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 1016; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 1017; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 1018; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 1019; GFX6-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1020; GFX6-NEXT: s_setpc_b64 s[30:31] 1021; 1022; GFX8-LABEL: v_ssubsat_i24: 1023; GFX8: ; %bb.0: 1024; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1025; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v1 1026; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 24 1027; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 24 1028; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 1029; GFX8-NEXT: v_bfe_i32 v0, v1, 0, 24 1030; GFX8-NEXT: v_cmp_lt_i32_e64 s[6:7], 0, v0 1031; GFX8-NEXT: v_ashrrev_i32_e32 v0, 23, v3 1032; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xff800000, v0 1033; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 1034; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 1035; GFX8-NEXT: s_setpc_b64 s[30:31] 1036; 1037; GFX9-LABEL: v_ssubsat_i24: 1038; GFX9: ; %bb.0: 1039; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1040; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1041; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1042; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp 1043; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1044; GFX9-NEXT: s_setpc_b64 s[30:31] 1045; 1046; GFX10PLUS-LABEL: v_ssubsat_i24: 1047; GFX10PLUS: ; %bb.0: 1048; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1049; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1050; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1051; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v1 clamp 1052; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1053; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1054 %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs) 1055 ret i24 %result 1056} 1057 1058define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { 1059; GFX6-LABEL: s_ssubsat_i24: 1060; GFX6: ; %bb.0: 1061; GFX6-NEXT: s_lshl_b32 s0, s0, 8 1062; GFX6-NEXT: s_max_i32 s2, s0, -1 1063; GFX6-NEXT: s_lshl_b32 s1, s1, 8 1064; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 1065; GFX6-NEXT: s_min_i32 s3, s0, -1 1066; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 1067; GFX6-NEXT: s_max_i32 s1, s2, s1 1068; GFX6-NEXT: s_min_i32 s1, s1, s3 1069; GFX6-NEXT: s_sub_i32 s0, s0, s1 1070; GFX6-NEXT: s_ashr_i32 s0, s0, 8 1071; GFX6-NEXT: ; return to shader part epilog 1072; 1073; GFX8-LABEL: s_ssubsat_i24: 1074; GFX8: ; %bb.0: 1075; GFX8-NEXT: s_sub_i32 s2, s0, s1 1076; GFX8-NEXT: s_bfe_i32 s3, s2, 0x180000 1077; GFX8-NEXT: s_bfe_i32 s0, s0, 0x180000 1078; GFX8-NEXT: s_cmp_lt_i32 s3, s0 1079; GFX8-NEXT: s_cselect_b32 s0, 1, 0 1080; GFX8-NEXT: s_bfe_i32 s1, s1, 0x180000 1081; GFX8-NEXT: s_cmp_gt_i32 s1, 0 1082; GFX8-NEXT: s_cselect_b32 s1, 1, 0 1083; GFX8-NEXT: s_xor_b32 s0, s1, s0 1084; GFX8-NEXT: s_ashr_i32 s1, s3, 23 1085; GFX8-NEXT: s_add_i32 s1, s1, 0xff800000 1086; GFX8-NEXT: s_and_b32 s0, s0, 1 1087; GFX8-NEXT: s_cmp_lg_u32 s0, 0 1088; GFX8-NEXT: s_cselect_b32 s0, s1, s2 1089; GFX8-NEXT: ; return to shader part epilog 1090; 1091; GFX9-LABEL: s_ssubsat_i24: 1092; GFX9: ; %bb.0: 1093; GFX9-NEXT: s_lshl_b32 s1, s1, 8 1094; GFX9-NEXT: s_lshl_b32 s0, s0, 8 1095; GFX9-NEXT: v_mov_b32_e32 v0, s1 1096; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1097; GFX9-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1098; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1099; GFX9-NEXT: ; return to shader part epilog 1100; 1101; GFX10PLUS-LABEL: s_ssubsat_i24: 1102; GFX10PLUS: ; %bb.0: 1103; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 1104; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 1105; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s1 clamp 1106; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 8, v0 1107; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1108; GFX10PLUS-NEXT: ; return to shader part epilog 1109 %result = call i24 @llvm.ssub.sat.i24(i24 %lhs, i24 %rhs) 1110 ret i24 %result 1111} 1112 1113define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { 1114; GFX6-LABEL: v_ssubsat_i32: 1115; GFX6: ; %bb.0: 1116; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1117; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 1118; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 1119; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 1120; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 1121; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 1122; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 1123; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 1124; GFX6-NEXT: s_setpc_b64 s[30:31] 1125; 1126; GFX8-LABEL: v_ssubsat_i32: 1127; GFX8: ; %bb.0: 1128; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1129; GFX8-NEXT: v_max_i32_e32 v2, -1, v0 1130; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000001, v2 1131; GFX8-NEXT: v_min_i32_e32 v3, -1, v0 1132; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3 1133; GFX8-NEXT: v_max_i32_e32 v1, v2, v1 1134; GFX8-NEXT: v_min_i32_e32 v1, v1, v3 1135; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 1136; GFX8-NEXT: s_setpc_b64 s[30:31] 1137; 1138; GFX9-LABEL: v_ssubsat_i32: 1139; GFX9: ; %bb.0: 1140; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1141; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp 1142; GFX9-NEXT: s_setpc_b64 s[30:31] 1143; 1144; GFX10PLUS-LABEL: v_ssubsat_i32: 1145; GFX10PLUS: ; %bb.0: 1146; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1147; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v1 clamp 1148; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1149 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) 1150 ret i32 %result 1151} 1152 1153define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { 1154; GFX6-LABEL: s_ssubsat_i32: 1155; GFX6: ; %bb.0: 1156; GFX6-NEXT: s_max_i32 s2, s0, -1 1157; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 1158; GFX6-NEXT: s_min_i32 s3, s0, -1 1159; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 1160; GFX6-NEXT: s_max_i32 s1, s2, s1 1161; GFX6-NEXT: s_min_i32 s1, s1, s3 1162; GFX6-NEXT: s_sub_i32 s0, s0, s1 1163; GFX6-NEXT: ; return to shader part epilog 1164; 1165; GFX8-LABEL: s_ssubsat_i32: 1166; GFX8: ; %bb.0: 1167; GFX8-NEXT: s_max_i32 s2, s0, -1 1168; GFX8-NEXT: s_add_i32 s2, s2, 0x80000001 1169; GFX8-NEXT: s_min_i32 s3, s0, -1 1170; GFX8-NEXT: s_add_i32 s3, s3, 0x80000000 1171; GFX8-NEXT: s_max_i32 s1, s2, s1 1172; GFX8-NEXT: s_min_i32 s1, s1, s3 1173; GFX8-NEXT: s_sub_i32 s0, s0, s1 1174; GFX8-NEXT: ; return to shader part epilog 1175; 1176; GFX9-LABEL: s_ssubsat_i32: 1177; GFX9: ; %bb.0: 1178; GFX9-NEXT: v_mov_b32_e32 v0, s1 1179; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1180; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1181; GFX9-NEXT: ; return to shader part epilog 1182; 1183; GFX10PLUS-LABEL: s_ssubsat_i32: 1184; GFX10PLUS: ; %bb.0: 1185; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s1 clamp 1186; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1187; GFX10PLUS-NEXT: ; return to shader part epilog 1188 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) 1189 ret i32 %result 1190} 1191 1192define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { 1193; GFX6-LABEL: ssubsat_i32_sv: 1194; GFX6: ; %bb.0: 1195; GFX6-NEXT: s_max_i32 s1, s0, -1 1196; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 1197; GFX6-NEXT: s_min_i32 s2, s0, -1 1198; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 1199; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 1200; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 1201; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1202; GFX6-NEXT: ; return to shader part epilog 1203; 1204; GFX8-LABEL: ssubsat_i32_sv: 1205; GFX8: ; %bb.0: 1206; GFX8-NEXT: s_max_i32 s1, s0, -1 1207; GFX8-NEXT: s_add_i32 s1, s1, 0x80000001 1208; GFX8-NEXT: s_min_i32 s2, s0, -1 1209; GFX8-NEXT: s_add_i32 s2, s2, 0x80000000 1210; GFX8-NEXT: v_max_i32_e32 v0, s1, v0 1211; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 1212; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 1213; GFX8-NEXT: ; return to shader part epilog 1214; 1215; GFX9-LABEL: ssubsat_i32_sv: 1216; GFX9: ; %bb.0: 1217; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1218; GFX9-NEXT: ; return to shader part epilog 1219; 1220; GFX10PLUS-LABEL: ssubsat_i32_sv: 1221; GFX10PLUS: ; %bb.0: 1222; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, v0 clamp 1223; GFX10PLUS-NEXT: ; return to shader part epilog 1224 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) 1225 %cast = bitcast i32 %result to float 1226 ret float %cast 1227} 1228 1229define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { 1230; GFX6-LABEL: ssubsat_i32_vs: 1231; GFX6: ; %bb.0: 1232; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 1233; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000001, v1 1234; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 1235; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 1236; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 1237; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 1238; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 1239; GFX6-NEXT: ; return to shader part epilog 1240; 1241; GFX8-LABEL: ssubsat_i32_vs: 1242; GFX8: ; %bb.0: 1243; GFX8-NEXT: v_max_i32_e32 v1, -1, v0 1244; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000001, v1 1245; GFX8-NEXT: v_min_i32_e32 v2, -1, v0 1246; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000000, v2 1247; GFX8-NEXT: v_max_i32_e32 v1, s0, v1 1248; GFX8-NEXT: v_min_i32_e32 v1, v1, v2 1249; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 1250; GFX8-NEXT: ; return to shader part epilog 1251; 1252; GFX9-LABEL: ssubsat_i32_vs: 1253; GFX9: ; %bb.0: 1254; GFX9-NEXT: v_sub_i32 v0, v0, s0 clamp 1255; GFX9-NEXT: ; return to shader part epilog 1256; 1257; GFX10PLUS-LABEL: ssubsat_i32_vs: 1258; GFX10PLUS: ; %bb.0: 1259; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, s0 clamp 1260; GFX10PLUS-NEXT: ; return to shader part epilog 1261 %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) 1262 %cast = bitcast i32 %result to float 1263 ret float %cast 1264} 1265 1266define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { 1267; GFX6-LABEL: v_ssubsat_v2i32: 1268; GFX6: ; %bb.0: 1269; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1270; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 1271; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 1272; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 1273; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 1274; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 1275; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 1276; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 1277; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 1278; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 1279; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 1280; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 1281; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 1282; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 1283; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 1284; GFX6-NEXT: s_setpc_b64 s[30:31] 1285; 1286; GFX8-LABEL: v_ssubsat_v2i32: 1287; GFX8: ; %bb.0: 1288; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1289; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 1290; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 1291; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 1292; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 1293; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 1294; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 1295; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 1296; GFX8-NEXT: v_max_i32_e32 v2, -1, v1 1297; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000001, v2 1298; GFX8-NEXT: v_min_i32_e32 v4, -1, v1 1299; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000000, v4 1300; GFX8-NEXT: v_max_i32_e32 v2, v2, v3 1301; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 1302; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 1303; GFX8-NEXT: s_setpc_b64 s[30:31] 1304; 1305; GFX9-LABEL: v_ssubsat_v2i32: 1306; GFX9: ; %bb.0: 1307; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1308; GFX9-NEXT: v_sub_i32 v0, v0, v2 clamp 1309; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp 1310; GFX9-NEXT: s_setpc_b64 s[30:31] 1311; 1312; GFX10PLUS-LABEL: v_ssubsat_v2i32: 1313; GFX10PLUS: ; %bb.0: 1314; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1315; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v2 clamp 1316; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v3 clamp 1317; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1318 %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1319 ret <2 x i32> %result 1320} 1321 1322define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { 1323; GFX6-LABEL: s_ssubsat_v2i32: 1324; GFX6: ; %bb.0: 1325; GFX6-NEXT: s_max_i32 s4, s0, -1 1326; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 1327; GFX6-NEXT: s_min_i32 s5, s0, -1 1328; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 1329; GFX6-NEXT: s_max_i32 s2, s4, s2 1330; GFX6-NEXT: s_min_i32 s2, s2, s5 1331; GFX6-NEXT: s_sub_i32 s0, s0, s2 1332; GFX6-NEXT: s_max_i32 s2, s1, -1 1333; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 1334; GFX6-NEXT: s_min_i32 s4, s1, -1 1335; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 1336; GFX6-NEXT: s_max_i32 s2, s2, s3 1337; GFX6-NEXT: s_min_i32 s2, s2, s4 1338; GFX6-NEXT: s_sub_i32 s1, s1, s2 1339; GFX6-NEXT: ; return to shader part epilog 1340; 1341; GFX8-LABEL: s_ssubsat_v2i32: 1342; GFX8: ; %bb.0: 1343; GFX8-NEXT: s_max_i32 s4, s0, -1 1344; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 1345; GFX8-NEXT: s_min_i32 s5, s0, -1 1346; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 1347; GFX8-NEXT: s_max_i32 s2, s4, s2 1348; GFX8-NEXT: s_min_i32 s2, s2, s5 1349; GFX8-NEXT: s_sub_i32 s0, s0, s2 1350; GFX8-NEXT: s_max_i32 s2, s1, -1 1351; GFX8-NEXT: s_add_i32 s2, s2, 0x80000001 1352; GFX8-NEXT: s_min_i32 s4, s1, -1 1353; GFX8-NEXT: s_add_i32 s4, s4, 0x80000000 1354; GFX8-NEXT: s_max_i32 s2, s2, s3 1355; GFX8-NEXT: s_min_i32 s2, s2, s4 1356; GFX8-NEXT: s_sub_i32 s1, s1, s2 1357; GFX8-NEXT: ; return to shader part epilog 1358; 1359; GFX9-LABEL: s_ssubsat_v2i32: 1360; GFX9: ; %bb.0: 1361; GFX9-NEXT: v_mov_b32_e32 v0, s2 1362; GFX9-NEXT: v_mov_b32_e32 v1, s3 1363; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1364; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp 1365; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1366; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1367; GFX9-NEXT: ; return to shader part epilog 1368; 1369; GFX10PLUS-LABEL: s_ssubsat_v2i32: 1370; GFX10PLUS: ; %bb.0: 1371; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s2 clamp 1372; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s3 clamp 1373; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1374; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1375; GFX10PLUS-NEXT: ; return to shader part epilog 1376 %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1377 ret <2 x i32> %result 1378} 1379 1380define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { 1381; GFX6-LABEL: v_ssubsat_v3i32: 1382; GFX6: ; %bb.0: 1383; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1384; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 1385; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000001, v6 1386; GFX6-NEXT: v_min_i32_e32 v8, -1, v0 1387; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 1388; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v9 1389; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 1390; GFX6-NEXT: v_min_i32_e32 v3, v3, v8 1391; GFX6-NEXT: v_mov_b32_e32 v7, 0x80000001 1392; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 1393; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 1394; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 1395; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 1396; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 1397; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 1398; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 1399; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 1400; GFX6-NEXT: v_max_i32_e32 v3, -1, v2 1401; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 1402; GFX6-NEXT: v_min_i32_e32 v4, -1, v2 1403; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 1404; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 1405; GFX6-NEXT: v_min_i32_e32 v3, v3, v4 1406; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 1407; GFX6-NEXT: s_setpc_b64 s[30:31] 1408; 1409; GFX8-LABEL: v_ssubsat_v3i32: 1410; GFX8: ; %bb.0: 1411; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1412; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 1413; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000001, v6 1414; GFX8-NEXT: v_min_i32_e32 v8, -1, v0 1415; GFX8-NEXT: v_bfrev_b32_e32 v9, 1 1416; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 1417; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 1418; GFX8-NEXT: v_min_i32_e32 v3, v3, v8 1419; GFX8-NEXT: v_mov_b32_e32 v7, 0x80000001 1420; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 1421; GFX8-NEXT: v_max_i32_e32 v3, -1, v1 1422; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 1423; GFX8-NEXT: v_min_i32_e32 v6, -1, v1 1424; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 1425; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 1426; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 1427; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 1428; GFX8-NEXT: v_max_i32_e32 v3, -1, v2 1429; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000001, v3 1430; GFX8-NEXT: v_min_i32_e32 v4, -1, v2 1431; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000000, v4 1432; GFX8-NEXT: v_max_i32_e32 v3, v3, v5 1433; GFX8-NEXT: v_min_i32_e32 v3, v3, v4 1434; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 1435; GFX8-NEXT: s_setpc_b64 s[30:31] 1436; 1437; GFX9-LABEL: v_ssubsat_v3i32: 1438; GFX9: ; %bb.0: 1439; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1440; GFX9-NEXT: v_sub_i32 v0, v0, v3 clamp 1441; GFX9-NEXT: v_sub_i32 v1, v1, v4 clamp 1442; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp 1443; GFX9-NEXT: s_setpc_b64 s[30:31] 1444; 1445; GFX10PLUS-LABEL: v_ssubsat_v3i32: 1446; GFX10PLUS: ; %bb.0: 1447; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1448; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v3 clamp 1449; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v4 clamp 1450; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v5 clamp 1451; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1452 %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1453 ret <3 x i32> %result 1454} 1455 1456define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { 1457; GFX6-LABEL: s_ssubsat_v3i32: 1458; GFX6: ; %bb.0: 1459; GFX6-NEXT: s_max_i32 s6, s0, -1 1460; GFX6-NEXT: s_add_i32 s6, s6, 0x80000001 1461; GFX6-NEXT: s_min_i32 s7, s0, -1 1462; GFX6-NEXT: s_add_i32 s7, s7, 0x80000000 1463; GFX6-NEXT: s_max_i32 s3, s6, s3 1464; GFX6-NEXT: s_min_i32 s3, s3, s7 1465; GFX6-NEXT: s_sub_i32 s0, s0, s3 1466; GFX6-NEXT: s_max_i32 s3, s1, -1 1467; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 1468; GFX6-NEXT: s_min_i32 s6, s1, -1 1469; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 1470; GFX6-NEXT: s_max_i32 s3, s3, s4 1471; GFX6-NEXT: s_min_i32 s3, s3, s6 1472; GFX6-NEXT: s_sub_i32 s1, s1, s3 1473; GFX6-NEXT: s_max_i32 s3, s2, -1 1474; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 1475; GFX6-NEXT: s_min_i32 s4, s2, -1 1476; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 1477; GFX6-NEXT: s_max_i32 s3, s3, s5 1478; GFX6-NEXT: s_min_i32 s3, s3, s4 1479; GFX6-NEXT: s_sub_i32 s2, s2, s3 1480; GFX6-NEXT: ; return to shader part epilog 1481; 1482; GFX8-LABEL: s_ssubsat_v3i32: 1483; GFX8: ; %bb.0: 1484; GFX8-NEXT: s_max_i32 s6, s0, -1 1485; GFX8-NEXT: s_add_i32 s6, s6, 0x80000001 1486; GFX8-NEXT: s_min_i32 s7, s0, -1 1487; GFX8-NEXT: s_add_i32 s7, s7, 0x80000000 1488; GFX8-NEXT: s_max_i32 s3, s6, s3 1489; GFX8-NEXT: s_min_i32 s3, s3, s7 1490; GFX8-NEXT: s_sub_i32 s0, s0, s3 1491; GFX8-NEXT: s_max_i32 s3, s1, -1 1492; GFX8-NEXT: s_add_i32 s3, s3, 0x80000001 1493; GFX8-NEXT: s_min_i32 s6, s1, -1 1494; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 1495; GFX8-NEXT: s_max_i32 s3, s3, s4 1496; GFX8-NEXT: s_min_i32 s3, s3, s6 1497; GFX8-NEXT: s_sub_i32 s1, s1, s3 1498; GFX8-NEXT: s_max_i32 s3, s2, -1 1499; GFX8-NEXT: s_add_i32 s3, s3, 0x80000001 1500; GFX8-NEXT: s_min_i32 s4, s2, -1 1501; GFX8-NEXT: s_add_i32 s4, s4, 0x80000000 1502; GFX8-NEXT: s_max_i32 s3, s3, s5 1503; GFX8-NEXT: s_min_i32 s3, s3, s4 1504; GFX8-NEXT: s_sub_i32 s2, s2, s3 1505; GFX8-NEXT: ; return to shader part epilog 1506; 1507; GFX9-LABEL: s_ssubsat_v3i32: 1508; GFX9: ; %bb.0: 1509; GFX9-NEXT: v_mov_b32_e32 v0, s3 1510; GFX9-NEXT: v_mov_b32_e32 v1, s4 1511; GFX9-NEXT: v_mov_b32_e32 v2, s5 1512; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1513; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp 1514; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp 1515; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1516; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1517; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1518; GFX9-NEXT: ; return to shader part epilog 1519; 1520; GFX10PLUS-LABEL: s_ssubsat_v3i32: 1521; GFX10PLUS: ; %bb.0: 1522; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s3 clamp 1523; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s4 clamp 1524; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s5 clamp 1525; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1526; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1527; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1528; GFX10PLUS-NEXT: ; return to shader part epilog 1529 %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1530 ret <3 x i32> %result 1531} 1532 1533define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { 1534; GFX6-LABEL: v_ssubsat_v4i32: 1535; GFX6: ; %bb.0: 1536; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1537; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 1538; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 1539; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 1540; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 1541; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 1542; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 1543; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 1544; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 1545; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 1546; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 1547; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 1548; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 1549; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 1550; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 1551; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 1552; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 1553; GFX6-NEXT: v_max_i32_e32 v4, -1, v2 1554; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 1555; GFX6-NEXT: v_min_i32_e32 v5, -1, v2 1556; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 1557; GFX6-NEXT: v_max_i32_e32 v4, v4, v6 1558; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 1559; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 1560; GFX6-NEXT: v_max_i32_e32 v4, -1, v3 1561; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 1562; GFX6-NEXT: v_min_i32_e32 v5, -1, v3 1563; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 1564; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 1565; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 1566; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 1567; GFX6-NEXT: s_setpc_b64 s[30:31] 1568; 1569; GFX8-LABEL: v_ssubsat_v4i32: 1570; GFX8: ; %bb.0: 1571; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1572; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 1573; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x80000001, v8 1574; GFX8-NEXT: v_min_i32_e32 v10, -1, v0 1575; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 1576; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 1577; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 1578; GFX8-NEXT: v_min_i32_e32 v4, v4, v10 1579; GFX8-NEXT: v_mov_b32_e32 v9, 0x80000001 1580; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 1581; GFX8-NEXT: v_max_i32_e32 v4, -1, v1 1582; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 1583; GFX8-NEXT: v_min_i32_e32 v8, -1, v1 1584; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v11 1585; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 1586; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 1587; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 1588; GFX8-NEXT: v_max_i32_e32 v4, -1, v2 1589; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 1590; GFX8-NEXT: v_min_i32_e32 v5, -1, v2 1591; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 1592; GFX8-NEXT: v_max_i32_e32 v4, v4, v6 1593; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 1594; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 1595; GFX8-NEXT: v_max_i32_e32 v4, -1, v3 1596; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 1597; GFX8-NEXT: v_min_i32_e32 v5, -1, v3 1598; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 1599; GFX8-NEXT: v_max_i32_e32 v4, v4, v7 1600; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 1601; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 1602; GFX8-NEXT: s_setpc_b64 s[30:31] 1603; 1604; GFX9-LABEL: v_ssubsat_v4i32: 1605; GFX9: ; %bb.0: 1606; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1607; GFX9-NEXT: v_sub_i32 v0, v0, v4 clamp 1608; GFX9-NEXT: v_sub_i32 v1, v1, v5 clamp 1609; GFX9-NEXT: v_sub_i32 v2, v2, v6 clamp 1610; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp 1611; GFX9-NEXT: s_setpc_b64 s[30:31] 1612; 1613; GFX10PLUS-LABEL: v_ssubsat_v4i32: 1614; GFX10PLUS: ; %bb.0: 1615; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1616; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v4 clamp 1617; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v5 clamp 1618; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v6 clamp 1619; GFX10PLUS-NEXT: v_sub_nc_i32 v3, v3, v7 clamp 1620; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1621 %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1622 ret <4 x i32> %result 1623} 1624 1625define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { 1626; GFX6-LABEL: s_ssubsat_v4i32: 1627; GFX6: ; %bb.0: 1628; GFX6-NEXT: s_max_i32 s8, s0, -1 1629; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 1630; GFX6-NEXT: s_min_i32 s9, s0, -1 1631; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 1632; GFX6-NEXT: s_max_i32 s4, s8, s4 1633; GFX6-NEXT: s_min_i32 s4, s4, s9 1634; GFX6-NEXT: s_sub_i32 s0, s0, s4 1635; GFX6-NEXT: s_max_i32 s4, s1, -1 1636; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 1637; GFX6-NEXT: s_min_i32 s8, s1, -1 1638; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 1639; GFX6-NEXT: s_max_i32 s4, s4, s5 1640; GFX6-NEXT: s_min_i32 s4, s4, s8 1641; GFX6-NEXT: s_sub_i32 s1, s1, s4 1642; GFX6-NEXT: s_max_i32 s4, s2, -1 1643; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 1644; GFX6-NEXT: s_min_i32 s5, s2, -1 1645; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 1646; GFX6-NEXT: s_max_i32 s4, s4, s6 1647; GFX6-NEXT: s_min_i32 s4, s4, s5 1648; GFX6-NEXT: s_sub_i32 s2, s2, s4 1649; GFX6-NEXT: s_max_i32 s4, s3, -1 1650; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 1651; GFX6-NEXT: s_min_i32 s5, s3, -1 1652; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 1653; GFX6-NEXT: s_max_i32 s4, s4, s7 1654; GFX6-NEXT: s_min_i32 s4, s4, s5 1655; GFX6-NEXT: s_sub_i32 s3, s3, s4 1656; GFX6-NEXT: ; return to shader part epilog 1657; 1658; GFX8-LABEL: s_ssubsat_v4i32: 1659; GFX8: ; %bb.0: 1660; GFX8-NEXT: s_max_i32 s8, s0, -1 1661; GFX8-NEXT: s_add_i32 s8, s8, 0x80000001 1662; GFX8-NEXT: s_min_i32 s9, s0, -1 1663; GFX8-NEXT: s_add_i32 s9, s9, 0x80000000 1664; GFX8-NEXT: s_max_i32 s4, s8, s4 1665; GFX8-NEXT: s_min_i32 s4, s4, s9 1666; GFX8-NEXT: s_sub_i32 s0, s0, s4 1667; GFX8-NEXT: s_max_i32 s4, s1, -1 1668; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 1669; GFX8-NEXT: s_min_i32 s8, s1, -1 1670; GFX8-NEXT: s_add_i32 s8, s8, 0x80000000 1671; GFX8-NEXT: s_max_i32 s4, s4, s5 1672; GFX8-NEXT: s_min_i32 s4, s4, s8 1673; GFX8-NEXT: s_sub_i32 s1, s1, s4 1674; GFX8-NEXT: s_max_i32 s4, s2, -1 1675; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 1676; GFX8-NEXT: s_min_i32 s5, s2, -1 1677; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 1678; GFX8-NEXT: s_max_i32 s4, s4, s6 1679; GFX8-NEXT: s_min_i32 s4, s4, s5 1680; GFX8-NEXT: s_sub_i32 s2, s2, s4 1681; GFX8-NEXT: s_max_i32 s4, s3, -1 1682; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 1683; GFX8-NEXT: s_min_i32 s5, s3, -1 1684; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 1685; GFX8-NEXT: s_max_i32 s4, s4, s7 1686; GFX8-NEXT: s_min_i32 s4, s4, s5 1687; GFX8-NEXT: s_sub_i32 s3, s3, s4 1688; GFX8-NEXT: ; return to shader part epilog 1689; 1690; GFX9-LABEL: s_ssubsat_v4i32: 1691; GFX9: ; %bb.0: 1692; GFX9-NEXT: v_mov_b32_e32 v0, s4 1693; GFX9-NEXT: v_mov_b32_e32 v1, s5 1694; GFX9-NEXT: v_mov_b32_e32 v2, s6 1695; GFX9-NEXT: v_mov_b32_e32 v3, s7 1696; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1697; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp 1698; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp 1699; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp 1700; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1701; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1702; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1703; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1704; GFX9-NEXT: ; return to shader part epilog 1705; 1706; GFX10PLUS-LABEL: s_ssubsat_v4i32: 1707; GFX10PLUS: ; %bb.0: 1708; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s4 clamp 1709; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s5 clamp 1710; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s6 clamp 1711; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s7 clamp 1712; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1713; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1714; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1715; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1716; GFX10PLUS-NEXT: ; return to shader part epilog 1717 %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1718 ret <4 x i32> %result 1719} 1720 1721define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { 1722; GFX6-LABEL: v_ssubsat_v5i32: 1723; GFX6: ; %bb.0: 1724; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1725; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 1726; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0x80000001, v10 1727; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 1728; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 1729; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v13 1730; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 1731; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 1732; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000001 1733; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 1734; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 1735; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 1736; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 1737; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v13 1738; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 1739; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 1740; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 1741; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 1742; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 1743; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 1744; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 1745; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 1746; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 1747; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 1748; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 1749; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 1750; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 1751; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 1752; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 1753; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 1754; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 1755; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 1756; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000001, v5 1757; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 1758; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000000, v6 1759; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 1760; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 1761; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 1762; GFX6-NEXT: s_setpc_b64 s[30:31] 1763; 1764; GFX8-LABEL: v_ssubsat_v5i32: 1765; GFX8: ; %bb.0: 1766; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1767; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 1768; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x80000001, v10 1769; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 1770; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 1771; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 1772; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 1773; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 1774; GFX8-NEXT: v_mov_b32_e32 v11, 0x80000001 1775; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 1776; GFX8-NEXT: v_max_i32_e32 v5, -1, v1 1777; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 1778; GFX8-NEXT: v_min_i32_e32 v10, -1, v1 1779; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v13 1780; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 1781; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 1782; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 1783; GFX8-NEXT: v_max_i32_e32 v5, -1, v2 1784; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 1785; GFX8-NEXT: v_min_i32_e32 v6, -1, v2 1786; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 1787; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 1788; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 1789; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 1790; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 1791; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 1792; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 1793; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 1794; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 1795; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 1796; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 1797; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 1798; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000001, v5 1799; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 1800; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000000, v6 1801; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 1802; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 1803; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 1804; GFX8-NEXT: s_setpc_b64 s[30:31] 1805; 1806; GFX9-LABEL: v_ssubsat_v5i32: 1807; GFX9: ; %bb.0: 1808; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1809; GFX9-NEXT: v_sub_i32 v0, v0, v5 clamp 1810; GFX9-NEXT: v_sub_i32 v1, v1, v6 clamp 1811; GFX9-NEXT: v_sub_i32 v2, v2, v7 clamp 1812; GFX9-NEXT: v_sub_i32 v3, v3, v8 clamp 1813; GFX9-NEXT: v_sub_i32 v4, v4, v9 clamp 1814; GFX9-NEXT: s_setpc_b64 s[30:31] 1815; 1816; GFX10PLUS-LABEL: v_ssubsat_v5i32: 1817; GFX10PLUS: ; %bb.0: 1818; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1819; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v5 clamp 1820; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v6 clamp 1821; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v7 clamp 1822; GFX10PLUS-NEXT: v_sub_nc_i32 v3, v3, v8 clamp 1823; GFX10PLUS-NEXT: v_sub_nc_i32 v4, v4, v9 clamp 1824; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1825 %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1826 ret <5 x i32> %result 1827} 1828 1829define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { 1830; GFX6-LABEL: s_ssubsat_v5i32: 1831; GFX6: ; %bb.0: 1832; GFX6-NEXT: s_max_i32 s10, s0, -1 1833; GFX6-NEXT: s_add_i32 s10, s10, 0x80000001 1834; GFX6-NEXT: s_min_i32 s11, s0, -1 1835; GFX6-NEXT: s_add_i32 s11, s11, 0x80000000 1836; GFX6-NEXT: s_max_i32 s5, s10, s5 1837; GFX6-NEXT: s_min_i32 s5, s5, s11 1838; GFX6-NEXT: s_sub_i32 s0, s0, s5 1839; GFX6-NEXT: s_max_i32 s5, s1, -1 1840; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 1841; GFX6-NEXT: s_min_i32 s10, s1, -1 1842; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 1843; GFX6-NEXT: s_max_i32 s5, s5, s6 1844; GFX6-NEXT: s_min_i32 s5, s5, s10 1845; GFX6-NEXT: s_sub_i32 s1, s1, s5 1846; GFX6-NEXT: s_max_i32 s5, s2, -1 1847; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 1848; GFX6-NEXT: s_min_i32 s6, s2, -1 1849; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 1850; GFX6-NEXT: s_max_i32 s5, s5, s7 1851; GFX6-NEXT: s_min_i32 s5, s5, s6 1852; GFX6-NEXT: s_sub_i32 s2, s2, s5 1853; GFX6-NEXT: s_max_i32 s5, s3, -1 1854; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 1855; GFX6-NEXT: s_min_i32 s6, s3, -1 1856; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 1857; GFX6-NEXT: s_max_i32 s5, s5, s8 1858; GFX6-NEXT: s_min_i32 s5, s5, s6 1859; GFX6-NEXT: s_sub_i32 s3, s3, s5 1860; GFX6-NEXT: s_max_i32 s5, s4, -1 1861; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 1862; GFX6-NEXT: s_min_i32 s6, s4, -1 1863; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 1864; GFX6-NEXT: s_max_i32 s5, s5, s9 1865; GFX6-NEXT: s_min_i32 s5, s5, s6 1866; GFX6-NEXT: s_sub_i32 s4, s4, s5 1867; GFX6-NEXT: ; return to shader part epilog 1868; 1869; GFX8-LABEL: s_ssubsat_v5i32: 1870; GFX8: ; %bb.0: 1871; GFX8-NEXT: s_max_i32 s10, s0, -1 1872; GFX8-NEXT: s_add_i32 s10, s10, 0x80000001 1873; GFX8-NEXT: s_min_i32 s11, s0, -1 1874; GFX8-NEXT: s_add_i32 s11, s11, 0x80000000 1875; GFX8-NEXT: s_max_i32 s5, s10, s5 1876; GFX8-NEXT: s_min_i32 s5, s5, s11 1877; GFX8-NEXT: s_sub_i32 s0, s0, s5 1878; GFX8-NEXT: s_max_i32 s5, s1, -1 1879; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 1880; GFX8-NEXT: s_min_i32 s10, s1, -1 1881; GFX8-NEXT: s_add_i32 s10, s10, 0x80000000 1882; GFX8-NEXT: s_max_i32 s5, s5, s6 1883; GFX8-NEXT: s_min_i32 s5, s5, s10 1884; GFX8-NEXT: s_sub_i32 s1, s1, s5 1885; GFX8-NEXT: s_max_i32 s5, s2, -1 1886; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 1887; GFX8-NEXT: s_min_i32 s6, s2, -1 1888; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 1889; GFX8-NEXT: s_max_i32 s5, s5, s7 1890; GFX8-NEXT: s_min_i32 s5, s5, s6 1891; GFX8-NEXT: s_sub_i32 s2, s2, s5 1892; GFX8-NEXT: s_max_i32 s5, s3, -1 1893; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 1894; GFX8-NEXT: s_min_i32 s6, s3, -1 1895; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 1896; GFX8-NEXT: s_max_i32 s5, s5, s8 1897; GFX8-NEXT: s_min_i32 s5, s5, s6 1898; GFX8-NEXT: s_sub_i32 s3, s3, s5 1899; GFX8-NEXT: s_max_i32 s5, s4, -1 1900; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 1901; GFX8-NEXT: s_min_i32 s6, s4, -1 1902; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 1903; GFX8-NEXT: s_max_i32 s5, s5, s9 1904; GFX8-NEXT: s_min_i32 s5, s5, s6 1905; GFX8-NEXT: s_sub_i32 s4, s4, s5 1906; GFX8-NEXT: ; return to shader part epilog 1907; 1908; GFX9-LABEL: s_ssubsat_v5i32: 1909; GFX9: ; %bb.0: 1910; GFX9-NEXT: v_mov_b32_e32 v0, s5 1911; GFX9-NEXT: v_mov_b32_e32 v1, s6 1912; GFX9-NEXT: v_mov_b32_e32 v2, s7 1913; GFX9-NEXT: v_mov_b32_e32 v3, s8 1914; GFX9-NEXT: v_mov_b32_e32 v4, s9 1915; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 1916; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp 1917; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp 1918; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp 1919; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp 1920; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1921; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1922; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1923; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1924; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1925; GFX9-NEXT: ; return to shader part epilog 1926; 1927; GFX10PLUS-LABEL: s_ssubsat_v5i32: 1928; GFX10PLUS: ; %bb.0: 1929; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s5 clamp 1930; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s6 clamp 1931; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s7 clamp 1932; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s8 clamp 1933; GFX10PLUS-NEXT: v_sub_nc_i32 v4, s4, s9 clamp 1934; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1935; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1936; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1937; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1938; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 1939; GFX10PLUS-NEXT: ; return to shader part epilog 1940 %result = call <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1941 ret <5 x i32> %result 1942} 1943 1944define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { 1945; GFX6-LABEL: v_ssubsat_v16i32: 1946; GFX6: ; %bb.0: 1947; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1948; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 1949; GFX6-NEXT: v_mov_b32_e32 v31, 0x80000001 1950; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 1951; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 1952; GFX6-NEXT: v_min_i32_e32 v33, -1, v0 1953; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 1954; GFX6-NEXT: v_add_i32_e32 v33, vcc, v33, v16 1955; GFX6-NEXT: v_min_i32_e32 v32, v32, v33 1956; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v32 1957; GFX6-NEXT: v_max_i32_e32 v32, -1, v1 1958; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 1959; GFX6-NEXT: v_max_i32_e32 v17, v32, v17 1960; GFX6-NEXT: v_min_i32_e32 v32, -1, v1 1961; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v16 1962; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 1963; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 1964; GFX6-NEXT: v_max_i32_e32 v17, -1, v2 1965; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 1966; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 1967; GFX6-NEXT: v_min_i32_e32 v18, -1, v2 1968; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 1969; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1970; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 1971; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 1972; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 1973; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 1974; GFX6-NEXT: buffer_load_dword v19, off, s[0:3], s32 1975; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 1976; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 1977; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1978; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 1979; GFX6-NEXT: v_max_i32_e32 v17, -1, v4 1980; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 1981; GFX6-NEXT: v_min_i32_e32 v18, -1, v4 1982; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 1983; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 1984; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1985; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 1986; GFX6-NEXT: v_max_i32_e32 v17, -1, v5 1987; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 1988; GFX6-NEXT: v_min_i32_e32 v18, -1, v5 1989; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 1990; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 1991; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1992; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v17 1993; GFX6-NEXT: v_max_i32_e32 v17, -1, v6 1994; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 1995; GFX6-NEXT: v_min_i32_e32 v18, -1, v6 1996; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 1997; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 1998; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 1999; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17 2000; GFX6-NEXT: v_max_i32_e32 v17, -1, v7 2001; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2002; GFX6-NEXT: v_min_i32_e32 v18, -1, v7 2003; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 2004; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2005; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2006; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17 2007; GFX6-NEXT: v_max_i32_e32 v17, -1, v8 2008; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2009; GFX6-NEXT: v_min_i32_e32 v18, -1, v8 2010; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 2011; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2012; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2013; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17 2014; GFX6-NEXT: v_max_i32_e32 v17, -1, v9 2015; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2016; GFX6-NEXT: v_min_i32_e32 v18, -1, v9 2017; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 2018; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2019; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2020; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 2021; GFX6-NEXT: v_max_i32_e32 v17, -1, v10 2022; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2023; GFX6-NEXT: v_min_i32_e32 v18, -1, v10 2024; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 2025; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2026; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2027; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 2028; GFX6-NEXT: v_max_i32_e32 v17, -1, v11 2029; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2030; GFX6-NEXT: v_min_i32_e32 v18, -1, v11 2031; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 2032; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2033; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2034; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 2035; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 2036; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2037; GFX6-NEXT: v_min_i32_e32 v18, -1, v12 2038; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 2039; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2040; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2041; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 2042; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 2043; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2044; GFX6-NEXT: v_min_i32_e32 v18, -1, v13 2045; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 2046; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2047; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2048; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 2049; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 2050; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2051; GFX6-NEXT: v_min_i32_e32 v18, -1, v14 2052; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 2053; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 2054; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 2055; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 2056; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 2057; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 2058; GFX6-NEXT: v_min_i32_e32 v18, -1, v15 2059; GFX6-NEXT: v_add_i32_e32 v16, vcc, v18, v16 2060; GFX6-NEXT: s_waitcnt vmcnt(0) 2061; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 2062; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 2063; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 2064; GFX6-NEXT: s_setpc_b64 s[30:31] 2065; 2066; GFX8-LABEL: v_ssubsat_v16i32: 2067; GFX8: ; %bb.0: 2068; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2069; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 2070; GFX8-NEXT: v_mov_b32_e32 v31, 0x80000001 2071; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 2072; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 2073; GFX8-NEXT: v_min_i32_e32 v33, -1, v0 2074; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 2075; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v16 2076; GFX8-NEXT: v_min_i32_e32 v32, v32, v33 2077; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v32 2078; GFX8-NEXT: v_max_i32_e32 v32, -1, v1 2079; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 2080; GFX8-NEXT: v_max_i32_e32 v17, v32, v17 2081; GFX8-NEXT: v_min_i32_e32 v32, -1, v1 2082; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v16 2083; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 2084; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 2085; GFX8-NEXT: v_max_i32_e32 v17, -1, v2 2086; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2087; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 2088; GFX8-NEXT: v_min_i32_e32 v18, -1, v2 2089; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2090; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2091; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 2092; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 2093; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2094; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 2095; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 2096; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 2097; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2098; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2099; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 2100; GFX8-NEXT: v_max_i32_e32 v17, -1, v4 2101; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2102; GFX8-NEXT: v_min_i32_e32 v18, -1, v4 2103; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 2104; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2105; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2106; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17 2107; GFX8-NEXT: v_max_i32_e32 v17, -1, v5 2108; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2109; GFX8-NEXT: v_min_i32_e32 v18, -1, v5 2110; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 2111; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2112; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2113; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v17 2114; GFX8-NEXT: v_max_i32_e32 v17, -1, v6 2115; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2116; GFX8-NEXT: v_min_i32_e32 v18, -1, v6 2117; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 2118; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2119; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2120; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17 2121; GFX8-NEXT: v_max_i32_e32 v17, -1, v7 2122; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2123; GFX8-NEXT: v_min_i32_e32 v18, -1, v7 2124; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 2125; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2126; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2127; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 2128; GFX8-NEXT: v_max_i32_e32 v17, -1, v8 2129; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2130; GFX8-NEXT: v_min_i32_e32 v18, -1, v8 2131; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 2132; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2133; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2134; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17 2135; GFX8-NEXT: v_max_i32_e32 v17, -1, v9 2136; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2137; GFX8-NEXT: v_min_i32_e32 v18, -1, v9 2138; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 2139; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2140; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2141; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17 2142; GFX8-NEXT: v_max_i32_e32 v17, -1, v10 2143; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2144; GFX8-NEXT: v_min_i32_e32 v18, -1, v10 2145; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 2146; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2147; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2148; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17 2149; GFX8-NEXT: v_max_i32_e32 v17, -1, v11 2150; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2151; GFX8-NEXT: v_min_i32_e32 v18, -1, v11 2152; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 2153; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2154; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2155; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 2156; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 2157; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2158; GFX8-NEXT: v_min_i32_e32 v18, -1, v12 2159; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 2160; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2161; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2162; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 2163; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 2164; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2165; GFX8-NEXT: v_min_i32_e32 v18, -1, v13 2166; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 2167; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2168; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2169; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 2170; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 2171; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2172; GFX8-NEXT: v_min_i32_e32 v18, -1, v14 2173; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 2174; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 2175; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 2176; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 2177; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 2178; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 2179; GFX8-NEXT: v_min_i32_e32 v18, -1, v15 2180; GFX8-NEXT: v_add_u32_e32 v16, vcc, v18, v16 2181; GFX8-NEXT: s_waitcnt vmcnt(0) 2182; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 2183; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 2184; GFX8-NEXT: v_sub_u32_e32 v15, vcc, v15, v16 2185; GFX8-NEXT: s_setpc_b64 s[30:31] 2186; 2187; GFX9-LABEL: v_ssubsat_v16i32: 2188; GFX9: ; %bb.0: 2189; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2190; GFX9-NEXT: v_sub_i32 v0, v0, v16 clamp 2191; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 2192; GFX9-NEXT: v_sub_i32 v1, v1, v17 clamp 2193; GFX9-NEXT: v_sub_i32 v2, v2, v18 clamp 2194; GFX9-NEXT: v_sub_i32 v3, v3, v19 clamp 2195; GFX9-NEXT: v_sub_i32 v4, v4, v20 clamp 2196; GFX9-NEXT: v_sub_i32 v5, v5, v21 clamp 2197; GFX9-NEXT: v_sub_i32 v6, v6, v22 clamp 2198; GFX9-NEXT: v_sub_i32 v7, v7, v23 clamp 2199; GFX9-NEXT: v_sub_i32 v8, v8, v24 clamp 2200; GFX9-NEXT: v_sub_i32 v9, v9, v25 clamp 2201; GFX9-NEXT: v_sub_i32 v10, v10, v26 clamp 2202; GFX9-NEXT: v_sub_i32 v11, v11, v27 clamp 2203; GFX9-NEXT: v_sub_i32 v12, v12, v28 clamp 2204; GFX9-NEXT: v_sub_i32 v13, v13, v29 clamp 2205; GFX9-NEXT: v_sub_i32 v14, v14, v30 clamp 2206; GFX9-NEXT: s_waitcnt vmcnt(0) 2207; GFX9-NEXT: v_sub_i32 v15, v15, v16 clamp 2208; GFX9-NEXT: s_setpc_b64 s[30:31] 2209; 2210; GFX10-LABEL: v_ssubsat_v16i32: 2211; GFX10: ; %bb.0: 2212; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2213; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 2214; GFX10-NEXT: v_sub_nc_i32 v0, v0, v16 clamp 2215; GFX10-NEXT: v_sub_nc_i32 v1, v1, v17 clamp 2216; GFX10-NEXT: v_sub_nc_i32 v2, v2, v18 clamp 2217; GFX10-NEXT: v_sub_nc_i32 v3, v3, v19 clamp 2218; GFX10-NEXT: v_sub_nc_i32 v4, v4, v20 clamp 2219; GFX10-NEXT: v_sub_nc_i32 v5, v5, v21 clamp 2220; GFX10-NEXT: v_sub_nc_i32 v6, v6, v22 clamp 2221; GFX10-NEXT: v_sub_nc_i32 v7, v7, v23 clamp 2222; GFX10-NEXT: v_sub_nc_i32 v8, v8, v24 clamp 2223; GFX10-NEXT: v_sub_nc_i32 v9, v9, v25 clamp 2224; GFX10-NEXT: v_sub_nc_i32 v10, v10, v26 clamp 2225; GFX10-NEXT: v_sub_nc_i32 v11, v11, v27 clamp 2226; GFX10-NEXT: v_sub_nc_i32 v12, v12, v28 clamp 2227; GFX10-NEXT: v_sub_nc_i32 v13, v13, v29 clamp 2228; GFX10-NEXT: v_sub_nc_i32 v14, v14, v30 clamp 2229; GFX10-NEXT: s_waitcnt vmcnt(0) 2230; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp 2231; GFX10-NEXT: s_setpc_b64 s[30:31] 2232; 2233; GFX11-LABEL: v_ssubsat_v16i32: 2234; GFX11: ; %bb.0: 2235; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2236; GFX11-NEXT: scratch_load_b32 v31, off, s32 2237; GFX11-NEXT: v_sub_nc_i32 v0, v0, v16 clamp 2238; GFX11-NEXT: v_sub_nc_i32 v1, v1, v17 clamp 2239; GFX11-NEXT: v_sub_nc_i32 v2, v2, v18 clamp 2240; GFX11-NEXT: v_sub_nc_i32 v3, v3, v19 clamp 2241; GFX11-NEXT: v_sub_nc_i32 v4, v4, v20 clamp 2242; GFX11-NEXT: v_sub_nc_i32 v5, v5, v21 clamp 2243; GFX11-NEXT: v_sub_nc_i32 v6, v6, v22 clamp 2244; GFX11-NEXT: v_sub_nc_i32 v7, v7, v23 clamp 2245; GFX11-NEXT: v_sub_nc_i32 v8, v8, v24 clamp 2246; GFX11-NEXT: v_sub_nc_i32 v9, v9, v25 clamp 2247; GFX11-NEXT: v_sub_nc_i32 v10, v10, v26 clamp 2248; GFX11-NEXT: v_sub_nc_i32 v11, v11, v27 clamp 2249; GFX11-NEXT: v_sub_nc_i32 v12, v12, v28 clamp 2250; GFX11-NEXT: v_sub_nc_i32 v13, v13, v29 clamp 2251; GFX11-NEXT: v_sub_nc_i32 v14, v14, v30 clamp 2252; GFX11-NEXT: s_waitcnt vmcnt(0) 2253; GFX11-NEXT: v_sub_nc_i32 v15, v15, v31 clamp 2254; GFX11-NEXT: s_setpc_b64 s[30:31] 2255 %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 2256 ret <16 x i32> %result 2257} 2258 2259define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { 2260; GFX6-LABEL: s_ssubsat_v16i32: 2261; GFX6: ; %bb.0: 2262; GFX6-NEXT: s_max_i32 s32, s0, -1 2263; GFX6-NEXT: s_add_i32 s32, s32, 0x80000001 2264; GFX6-NEXT: s_min_i32 s33, s0, -1 2265; GFX6-NEXT: s_add_i32 s33, s33, 0x80000000 2266; GFX6-NEXT: s_max_i32 s16, s32, s16 2267; GFX6-NEXT: s_min_i32 s16, s16, s33 2268; GFX6-NEXT: s_sub_i32 s0, s0, s16 2269; GFX6-NEXT: s_max_i32 s16, s1, -1 2270; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2271; GFX6-NEXT: s_min_i32 s32, s1, -1 2272; GFX6-NEXT: s_add_i32 s32, s32, 0x80000000 2273; GFX6-NEXT: s_max_i32 s16, s16, s17 2274; GFX6-NEXT: s_min_i32 s16, s16, s32 2275; GFX6-NEXT: s_sub_i32 s1, s1, s16 2276; GFX6-NEXT: s_max_i32 s16, s2, -1 2277; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2278; GFX6-NEXT: s_min_i32 s17, s2, -1 2279; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2280; GFX6-NEXT: s_max_i32 s16, s16, s18 2281; GFX6-NEXT: s_min_i32 s16, s16, s17 2282; GFX6-NEXT: s_sub_i32 s2, s2, s16 2283; GFX6-NEXT: s_max_i32 s16, s3, -1 2284; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2285; GFX6-NEXT: s_min_i32 s17, s3, -1 2286; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2287; GFX6-NEXT: s_max_i32 s16, s16, s19 2288; GFX6-NEXT: s_min_i32 s16, s16, s17 2289; GFX6-NEXT: s_sub_i32 s3, s3, s16 2290; GFX6-NEXT: s_max_i32 s16, s4, -1 2291; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2292; GFX6-NEXT: s_min_i32 s17, s4, -1 2293; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2294; GFX6-NEXT: s_max_i32 s16, s16, s20 2295; GFX6-NEXT: s_min_i32 s16, s16, s17 2296; GFX6-NEXT: s_sub_i32 s4, s4, s16 2297; GFX6-NEXT: s_max_i32 s16, s5, -1 2298; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2299; GFX6-NEXT: s_min_i32 s17, s5, -1 2300; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2301; GFX6-NEXT: s_max_i32 s16, s16, s21 2302; GFX6-NEXT: s_min_i32 s16, s16, s17 2303; GFX6-NEXT: s_sub_i32 s5, s5, s16 2304; GFX6-NEXT: s_max_i32 s16, s6, -1 2305; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2306; GFX6-NEXT: s_min_i32 s17, s6, -1 2307; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2308; GFX6-NEXT: s_max_i32 s16, s16, s22 2309; GFX6-NEXT: s_min_i32 s16, s16, s17 2310; GFX6-NEXT: s_sub_i32 s6, s6, s16 2311; GFX6-NEXT: s_max_i32 s16, s7, -1 2312; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2313; GFX6-NEXT: s_min_i32 s17, s7, -1 2314; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2315; GFX6-NEXT: s_max_i32 s16, s16, s23 2316; GFX6-NEXT: s_min_i32 s16, s16, s17 2317; GFX6-NEXT: s_sub_i32 s7, s7, s16 2318; GFX6-NEXT: s_max_i32 s16, s8, -1 2319; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2320; GFX6-NEXT: s_min_i32 s17, s8, -1 2321; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2322; GFX6-NEXT: s_max_i32 s16, s16, s24 2323; GFX6-NEXT: s_min_i32 s16, s16, s17 2324; GFX6-NEXT: s_sub_i32 s8, s8, s16 2325; GFX6-NEXT: s_max_i32 s16, s9, -1 2326; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2327; GFX6-NEXT: s_min_i32 s17, s9, -1 2328; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2329; GFX6-NEXT: s_max_i32 s16, s16, s25 2330; GFX6-NEXT: s_min_i32 s16, s16, s17 2331; GFX6-NEXT: s_sub_i32 s9, s9, s16 2332; GFX6-NEXT: s_max_i32 s16, s10, -1 2333; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2334; GFX6-NEXT: s_min_i32 s17, s10, -1 2335; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2336; GFX6-NEXT: s_max_i32 s16, s16, s26 2337; GFX6-NEXT: s_min_i32 s16, s16, s17 2338; GFX6-NEXT: s_sub_i32 s10, s10, s16 2339; GFX6-NEXT: s_max_i32 s16, s11, -1 2340; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2341; GFX6-NEXT: s_min_i32 s17, s11, -1 2342; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2343; GFX6-NEXT: s_max_i32 s16, s16, s27 2344; GFX6-NEXT: s_min_i32 s16, s16, s17 2345; GFX6-NEXT: s_sub_i32 s11, s11, s16 2346; GFX6-NEXT: s_max_i32 s16, s12, -1 2347; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2348; GFX6-NEXT: s_min_i32 s17, s12, -1 2349; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2350; GFX6-NEXT: s_max_i32 s16, s16, s28 2351; GFX6-NEXT: s_min_i32 s16, s16, s17 2352; GFX6-NEXT: s_sub_i32 s12, s12, s16 2353; GFX6-NEXT: s_max_i32 s16, s13, -1 2354; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2355; GFX6-NEXT: s_min_i32 s17, s13, -1 2356; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2357; GFX6-NEXT: s_max_i32 s16, s16, s29 2358; GFX6-NEXT: s_min_i32 s16, s16, s17 2359; GFX6-NEXT: s_sub_i32 s13, s13, s16 2360; GFX6-NEXT: s_max_i32 s16, s14, -1 2361; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2362; GFX6-NEXT: s_min_i32 s17, s14, -1 2363; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2364; GFX6-NEXT: s_max_i32 s16, s16, s30 2365; GFX6-NEXT: s_min_i32 s16, s16, s17 2366; GFX6-NEXT: s_sub_i32 s14, s14, s16 2367; GFX6-NEXT: s_max_i32 s16, s15, -1 2368; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 2369; GFX6-NEXT: s_min_i32 s17, s15, -1 2370; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 2371; GFX6-NEXT: s_max_i32 s16, s16, s31 2372; GFX6-NEXT: s_min_i32 s16, s16, s17 2373; GFX6-NEXT: s_sub_i32 s15, s15, s16 2374; GFX6-NEXT: ; return to shader part epilog 2375; 2376; GFX8-LABEL: s_ssubsat_v16i32: 2377; GFX8: ; %bb.0: 2378; GFX8-NEXT: s_max_i32 s32, s0, -1 2379; GFX8-NEXT: s_add_i32 s32, s32, 0x80000001 2380; GFX8-NEXT: s_min_i32 s33, s0, -1 2381; GFX8-NEXT: s_add_i32 s33, s33, 0x80000000 2382; GFX8-NEXT: s_max_i32 s16, s32, s16 2383; GFX8-NEXT: s_min_i32 s16, s16, s33 2384; GFX8-NEXT: s_sub_i32 s0, s0, s16 2385; GFX8-NEXT: s_max_i32 s16, s1, -1 2386; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2387; GFX8-NEXT: s_min_i32 s32, s1, -1 2388; GFX8-NEXT: s_add_i32 s32, s32, 0x80000000 2389; GFX8-NEXT: s_max_i32 s16, s16, s17 2390; GFX8-NEXT: s_min_i32 s16, s16, s32 2391; GFX8-NEXT: s_sub_i32 s1, s1, s16 2392; GFX8-NEXT: s_max_i32 s16, s2, -1 2393; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2394; GFX8-NEXT: s_min_i32 s17, s2, -1 2395; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2396; GFX8-NEXT: s_max_i32 s16, s16, s18 2397; GFX8-NEXT: s_min_i32 s16, s16, s17 2398; GFX8-NEXT: s_sub_i32 s2, s2, s16 2399; GFX8-NEXT: s_max_i32 s16, s3, -1 2400; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2401; GFX8-NEXT: s_min_i32 s17, s3, -1 2402; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2403; GFX8-NEXT: s_max_i32 s16, s16, s19 2404; GFX8-NEXT: s_min_i32 s16, s16, s17 2405; GFX8-NEXT: s_sub_i32 s3, s3, s16 2406; GFX8-NEXT: s_max_i32 s16, s4, -1 2407; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2408; GFX8-NEXT: s_min_i32 s17, s4, -1 2409; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2410; GFX8-NEXT: s_max_i32 s16, s16, s20 2411; GFX8-NEXT: s_min_i32 s16, s16, s17 2412; GFX8-NEXT: s_sub_i32 s4, s4, s16 2413; GFX8-NEXT: s_max_i32 s16, s5, -1 2414; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2415; GFX8-NEXT: s_min_i32 s17, s5, -1 2416; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2417; GFX8-NEXT: s_max_i32 s16, s16, s21 2418; GFX8-NEXT: s_min_i32 s16, s16, s17 2419; GFX8-NEXT: s_sub_i32 s5, s5, s16 2420; GFX8-NEXT: s_max_i32 s16, s6, -1 2421; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2422; GFX8-NEXT: s_min_i32 s17, s6, -1 2423; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2424; GFX8-NEXT: s_max_i32 s16, s16, s22 2425; GFX8-NEXT: s_min_i32 s16, s16, s17 2426; GFX8-NEXT: s_sub_i32 s6, s6, s16 2427; GFX8-NEXT: s_max_i32 s16, s7, -1 2428; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2429; GFX8-NEXT: s_min_i32 s17, s7, -1 2430; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2431; GFX8-NEXT: s_max_i32 s16, s16, s23 2432; GFX8-NEXT: s_min_i32 s16, s16, s17 2433; GFX8-NEXT: s_sub_i32 s7, s7, s16 2434; GFX8-NEXT: s_max_i32 s16, s8, -1 2435; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2436; GFX8-NEXT: s_min_i32 s17, s8, -1 2437; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2438; GFX8-NEXT: s_max_i32 s16, s16, s24 2439; GFX8-NEXT: s_min_i32 s16, s16, s17 2440; GFX8-NEXT: s_sub_i32 s8, s8, s16 2441; GFX8-NEXT: s_max_i32 s16, s9, -1 2442; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2443; GFX8-NEXT: s_min_i32 s17, s9, -1 2444; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2445; GFX8-NEXT: s_max_i32 s16, s16, s25 2446; GFX8-NEXT: s_min_i32 s16, s16, s17 2447; GFX8-NEXT: s_sub_i32 s9, s9, s16 2448; GFX8-NEXT: s_max_i32 s16, s10, -1 2449; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2450; GFX8-NEXT: s_min_i32 s17, s10, -1 2451; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2452; GFX8-NEXT: s_max_i32 s16, s16, s26 2453; GFX8-NEXT: s_min_i32 s16, s16, s17 2454; GFX8-NEXT: s_sub_i32 s10, s10, s16 2455; GFX8-NEXT: s_max_i32 s16, s11, -1 2456; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2457; GFX8-NEXT: s_min_i32 s17, s11, -1 2458; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2459; GFX8-NEXT: s_max_i32 s16, s16, s27 2460; GFX8-NEXT: s_min_i32 s16, s16, s17 2461; GFX8-NEXT: s_sub_i32 s11, s11, s16 2462; GFX8-NEXT: s_max_i32 s16, s12, -1 2463; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2464; GFX8-NEXT: s_min_i32 s17, s12, -1 2465; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2466; GFX8-NEXT: s_max_i32 s16, s16, s28 2467; GFX8-NEXT: s_min_i32 s16, s16, s17 2468; GFX8-NEXT: s_sub_i32 s12, s12, s16 2469; GFX8-NEXT: s_max_i32 s16, s13, -1 2470; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2471; GFX8-NEXT: s_min_i32 s17, s13, -1 2472; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2473; GFX8-NEXT: s_max_i32 s16, s16, s29 2474; GFX8-NEXT: s_min_i32 s16, s16, s17 2475; GFX8-NEXT: s_sub_i32 s13, s13, s16 2476; GFX8-NEXT: s_max_i32 s16, s14, -1 2477; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2478; GFX8-NEXT: s_min_i32 s17, s14, -1 2479; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2480; GFX8-NEXT: s_max_i32 s16, s16, s30 2481; GFX8-NEXT: s_min_i32 s16, s16, s17 2482; GFX8-NEXT: s_sub_i32 s14, s14, s16 2483; GFX8-NEXT: s_max_i32 s16, s15, -1 2484; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 2485; GFX8-NEXT: s_min_i32 s17, s15, -1 2486; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 2487; GFX8-NEXT: s_max_i32 s16, s16, s31 2488; GFX8-NEXT: s_min_i32 s16, s16, s17 2489; GFX8-NEXT: s_sub_i32 s15, s15, s16 2490; GFX8-NEXT: ; return to shader part epilog 2491; 2492; GFX9-LABEL: s_ssubsat_v16i32: 2493; GFX9: ; %bb.0: 2494; GFX9-NEXT: v_mov_b32_e32 v0, s16 2495; GFX9-NEXT: v_mov_b32_e32 v1, s17 2496; GFX9-NEXT: v_mov_b32_e32 v2, s18 2497; GFX9-NEXT: v_mov_b32_e32 v3, s19 2498; GFX9-NEXT: v_mov_b32_e32 v4, s20 2499; GFX9-NEXT: v_mov_b32_e32 v5, s21 2500; GFX9-NEXT: v_mov_b32_e32 v6, s22 2501; GFX9-NEXT: v_mov_b32_e32 v7, s23 2502; GFX9-NEXT: v_mov_b32_e32 v8, s24 2503; GFX9-NEXT: v_mov_b32_e32 v9, s25 2504; GFX9-NEXT: v_mov_b32_e32 v10, s26 2505; GFX9-NEXT: v_mov_b32_e32 v11, s27 2506; GFX9-NEXT: v_mov_b32_e32 v12, s28 2507; GFX9-NEXT: v_mov_b32_e32 v13, s29 2508; GFX9-NEXT: v_mov_b32_e32 v14, s30 2509; GFX9-NEXT: v_mov_b32_e32 v15, s31 2510; GFX9-NEXT: v_sub_i32 v0, s0, v0 clamp 2511; GFX9-NEXT: v_sub_i32 v1, s1, v1 clamp 2512; GFX9-NEXT: v_sub_i32 v2, s2, v2 clamp 2513; GFX9-NEXT: v_sub_i32 v3, s3, v3 clamp 2514; GFX9-NEXT: v_sub_i32 v4, s4, v4 clamp 2515; GFX9-NEXT: v_sub_i32 v5, s5, v5 clamp 2516; GFX9-NEXT: v_sub_i32 v6, s6, v6 clamp 2517; GFX9-NEXT: v_sub_i32 v7, s7, v7 clamp 2518; GFX9-NEXT: v_sub_i32 v8, s8, v8 clamp 2519; GFX9-NEXT: v_sub_i32 v9, s9, v9 clamp 2520; GFX9-NEXT: v_sub_i32 v10, s10, v10 clamp 2521; GFX9-NEXT: v_sub_i32 v11, s11, v11 clamp 2522; GFX9-NEXT: v_sub_i32 v12, s12, v12 clamp 2523; GFX9-NEXT: v_sub_i32 v13, s13, v13 clamp 2524; GFX9-NEXT: v_sub_i32 v14, s14, v14 clamp 2525; GFX9-NEXT: v_sub_i32 v15, s15, v15 clamp 2526; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2527; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2528; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2529; GFX9-NEXT: v_readfirstlane_b32 s3, v3 2530; GFX9-NEXT: v_readfirstlane_b32 s4, v4 2531; GFX9-NEXT: v_readfirstlane_b32 s5, v5 2532; GFX9-NEXT: v_readfirstlane_b32 s6, v6 2533; GFX9-NEXT: v_readfirstlane_b32 s7, v7 2534; GFX9-NEXT: v_readfirstlane_b32 s8, v8 2535; GFX9-NEXT: v_readfirstlane_b32 s9, v9 2536; GFX9-NEXT: v_readfirstlane_b32 s10, v10 2537; GFX9-NEXT: v_readfirstlane_b32 s11, v11 2538; GFX9-NEXT: v_readfirstlane_b32 s12, v12 2539; GFX9-NEXT: v_readfirstlane_b32 s13, v13 2540; GFX9-NEXT: v_readfirstlane_b32 s14, v14 2541; GFX9-NEXT: v_readfirstlane_b32 s15, v15 2542; GFX9-NEXT: ; return to shader part epilog 2543; 2544; GFX10PLUS-LABEL: s_ssubsat_v16i32: 2545; GFX10PLUS: ; %bb.0: 2546; GFX10PLUS-NEXT: v_sub_nc_i32 v0, s0, s16 clamp 2547; GFX10PLUS-NEXT: v_sub_nc_i32 v1, s1, s17 clamp 2548; GFX10PLUS-NEXT: v_sub_nc_i32 v2, s2, s18 clamp 2549; GFX10PLUS-NEXT: v_sub_nc_i32 v3, s3, s19 clamp 2550; GFX10PLUS-NEXT: v_sub_nc_i32 v4, s4, s20 clamp 2551; GFX10PLUS-NEXT: v_sub_nc_i32 v5, s5, s21 clamp 2552; GFX10PLUS-NEXT: v_sub_nc_i32 v6, s6, s22 clamp 2553; GFX10PLUS-NEXT: v_sub_nc_i32 v7, s7, s23 clamp 2554; GFX10PLUS-NEXT: v_sub_nc_i32 v8, s8, s24 clamp 2555; GFX10PLUS-NEXT: v_sub_nc_i32 v9, s9, s25 clamp 2556; GFX10PLUS-NEXT: v_sub_nc_i32 v10, s10, s26 clamp 2557; GFX10PLUS-NEXT: v_sub_nc_i32 v11, s11, s27 clamp 2558; GFX10PLUS-NEXT: v_sub_nc_i32 v12, s12, s28 clamp 2559; GFX10PLUS-NEXT: v_sub_nc_i32 v13, s13, s29 clamp 2560; GFX10PLUS-NEXT: v_sub_nc_i32 v14, s14, s30 clamp 2561; GFX10PLUS-NEXT: v_sub_nc_i32 v15, s15, s31 clamp 2562; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2563; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 2564; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 2565; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 2566; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 2567; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 2568; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 2569; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 2570; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 2571; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 2572; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 2573; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 2574; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 2575; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 2576; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 2577; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 2578; GFX10PLUS-NEXT: ; return to shader part epilog 2579 %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 2580 ret <16 x i32> %result 2581} 2582 2583define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { 2584; GFX6-LABEL: v_ssubsat_i16: 2585; GFX6: ; %bb.0: 2586; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2587; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2588; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 2589; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2590; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 2591; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 2592; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 2593; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 2594; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 2595; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 2596; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2597; GFX6-NEXT: s_setpc_b64 s[30:31] 2598; 2599; GFX8-LABEL: v_ssubsat_i16: 2600; GFX8: ; %bb.0: 2601; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2602; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 2603; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 2604; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 2605; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 2606; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 2607; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 2608; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 2609; GFX8-NEXT: s_setpc_b64 s[30:31] 2610; 2611; GFX9-LABEL: v_ssubsat_i16: 2612; GFX9: ; %bb.0: 2613; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2614; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp 2615; GFX9-NEXT: s_setpc_b64 s[30:31] 2616; 2617; GFX10PLUS-LABEL: v_ssubsat_i16: 2618; GFX10PLUS: ; %bb.0: 2619; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2620; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp 2621; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2622 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) 2623 ret i16 %result 2624} 2625 2626define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { 2627; GFX6-LABEL: s_ssubsat_i16: 2628; GFX6: ; %bb.0: 2629; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2630; GFX6-NEXT: s_max_i32 s2, s0, -1 2631; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2632; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 2633; GFX6-NEXT: s_min_i32 s3, s0, -1 2634; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 2635; GFX6-NEXT: s_max_i32 s1, s2, s1 2636; GFX6-NEXT: s_min_i32 s1, s1, s3 2637; GFX6-NEXT: s_sub_i32 s0, s0, s1 2638; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2639; GFX6-NEXT: ; return to shader part epilog 2640; 2641; GFX8-LABEL: s_ssubsat_i16: 2642; GFX8: ; %bb.0: 2643; GFX8-NEXT: s_sext_i32_i16 s2, s0 2644; GFX8-NEXT: s_sext_i32_i16 s3, -1 2645; GFX8-NEXT: s_max_i32 s4, s2, s3 2646; GFX8-NEXT: s_addk_i32 s4, 0x8001 2647; GFX8-NEXT: s_min_i32 s2, s2, s3 2648; GFX8-NEXT: s_sext_i32_i16 s3, s4 2649; GFX8-NEXT: s_sext_i32_i16 s1, s1 2650; GFX8-NEXT: s_addk_i32 s2, 0x8000 2651; GFX8-NEXT: s_max_i32 s1, s3, s1 2652; GFX8-NEXT: s_sext_i32_i16 s1, s1 2653; GFX8-NEXT: s_sext_i32_i16 s2, s2 2654; GFX8-NEXT: s_min_i32 s1, s1, s2 2655; GFX8-NEXT: s_sub_i32 s0, s0, s1 2656; GFX8-NEXT: ; return to shader part epilog 2657; 2658; GFX9-LABEL: s_ssubsat_i16: 2659; GFX9: ; %bb.0: 2660; GFX9-NEXT: v_mov_b32_e32 v0, s1 2661; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp 2662; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2663; GFX9-NEXT: ; return to shader part epilog 2664; 2665; GFX10PLUS-LABEL: s_ssubsat_i16: 2666; GFX10PLUS: ; %bb.0: 2667; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, s1 clamp 2668; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2669; GFX10PLUS-NEXT: ; return to shader part epilog 2670 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) 2671 ret i16 %result 2672} 2673 2674define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { 2675; GFX6-LABEL: ssubsat_i16_sv: 2676; GFX6: ; %bb.0: 2677; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2678; GFX6-NEXT: s_max_i32 s1, s0, -1 2679; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2680; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 2681; GFX6-NEXT: s_min_i32 s2, s0, -1 2682; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 2683; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 2684; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 2685; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2686; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2687; GFX6-NEXT: ; return to shader part epilog 2688; 2689; GFX8-LABEL: ssubsat_i16_sv: 2690; GFX8: ; %bb.0: 2691; GFX8-NEXT: s_sext_i32_i16 s1, s0 2692; GFX8-NEXT: s_sext_i32_i16 s2, -1 2693; GFX8-NEXT: s_max_i32 s3, s1, s2 2694; GFX8-NEXT: s_addk_i32 s3, 0x8001 2695; GFX8-NEXT: s_min_i32 s1, s1, s2 2696; GFX8-NEXT: s_addk_i32 s1, 0x8000 2697; GFX8-NEXT: v_max_i16_e32 v0, s3, v0 2698; GFX8-NEXT: v_min_i16_e32 v0, s1, v0 2699; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 2700; GFX8-NEXT: ; return to shader part epilog 2701; 2702; GFX9-LABEL: ssubsat_i16_sv: 2703; GFX9: ; %bb.0: 2704; GFX9-NEXT: v_sub_i16 v0, s0, v0 clamp 2705; GFX9-NEXT: ; return to shader part epilog 2706; 2707; GFX10PLUS-LABEL: ssubsat_i16_sv: 2708; GFX10PLUS: ; %bb.0: 2709; GFX10PLUS-NEXT: v_sub_nc_i16 v0, s0, v0 clamp 2710; GFX10PLUS-NEXT: ; return to shader part epilog 2711 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) 2712 %cast = bitcast i16 %result to half 2713 ret half %cast 2714} 2715 2716define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { 2717; GFX6-LABEL: ssubsat_i16_vs: 2718; GFX6: ; %bb.0: 2719; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2720; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 2721; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2722; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000001, v1 2723; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 2724; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 2725; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 2726; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 2727; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 2728; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2729; GFX6-NEXT: ; return to shader part epilog 2730; 2731; GFX8-LABEL: ssubsat_i16_vs: 2732; GFX8: ; %bb.0: 2733; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 2734; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 2735; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 2736; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 2737; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 2738; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 2739; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 2740; GFX8-NEXT: ; return to shader part epilog 2741; 2742; GFX9-LABEL: ssubsat_i16_vs: 2743; GFX9: ; %bb.0: 2744; GFX9-NEXT: v_sub_i16 v0, v0, s0 clamp 2745; GFX9-NEXT: ; return to shader part epilog 2746; 2747; GFX10PLUS-LABEL: ssubsat_i16_vs: 2748; GFX10PLUS: ; %bb.0: 2749; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, s0 clamp 2750; GFX10PLUS-NEXT: ; return to shader part epilog 2751 %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) 2752 %cast = bitcast i16 %result to half 2753 ret half %cast 2754} 2755 2756define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { 2757; GFX6-LABEL: v_ssubsat_v2i16: 2758; GFX6: ; %bb.0: 2759; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2760; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2761; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 2762; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2763; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 2764; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 2765; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 2766; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 2767; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 2768; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 2769; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2770; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 2771; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2772; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 2773; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 2774; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 2775; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 2776; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 2777; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 2778; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 2779; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2780; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2781; GFX6-NEXT: s_setpc_b64 s[30:31] 2782; 2783; GFX8-LABEL: v_ssubsat_v2i16: 2784; GFX8: ; %bb.0: 2785; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2786; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 2787; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 2788; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 2789; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 2790; GFX8-NEXT: v_max_i16_e32 v2, v2, v1 2791; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 2792; GFX8-NEXT: v_mov_b32_e32 v3, -1 2793; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2794; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 2795; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2796; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 2797; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2798; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 2799; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 2800; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 2801; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 2802; GFX8-NEXT: s_setpc_b64 s[30:31] 2803; 2804; GFX9-LABEL: v_ssubsat_v2i16: 2805; GFX9: ; %bb.0: 2806; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2807; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 2808; GFX9-NEXT: s_setpc_b64 s[30:31] 2809; 2810; GFX10PLUS-LABEL: v_ssubsat_v2i16: 2811; GFX10PLUS: ; %bb.0: 2812; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2813; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v1 clamp 2814; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2815 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2816 ret <2 x i16> %result 2817} 2818 2819define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { 2820; GFX6-LABEL: s_ssubsat_v2i16: 2821; GFX6: ; %bb.0: 2822; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2823; GFX6-NEXT: s_max_i32 s4, s0, -1 2824; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2825; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 2826; GFX6-NEXT: s_min_i32 s5, s0, -1 2827; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 2828; GFX6-NEXT: s_max_i32 s2, s4, s2 2829; GFX6-NEXT: s_min_i32 s2, s2, s5 2830; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2831; GFX6-NEXT: s_sub_i32 s0, s0, s2 2832; GFX6-NEXT: s_lshl_b32 s2, s3, 16 2833; GFX6-NEXT: s_max_i32 s3, s1, -1 2834; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 2835; GFX6-NEXT: s_min_i32 s4, s1, -1 2836; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 2837; GFX6-NEXT: s_max_i32 s2, s3, s2 2838; GFX6-NEXT: s_min_i32 s2, s2, s4 2839; GFX6-NEXT: s_sub_i32 s1, s1, s2 2840; GFX6-NEXT: s_ashr_i32 s1, s1, 16 2841; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2842; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 2843; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 2844; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2845; GFX6-NEXT: s_or_b32 s0, s0, s1 2846; GFX6-NEXT: ; return to shader part epilog 2847; 2848; GFX8-LABEL: s_ssubsat_v2i16: 2849; GFX8: ; %bb.0: 2850; GFX8-NEXT: s_sext_i32_i16 s4, s0 2851; GFX8-NEXT: s_sext_i32_i16 s5, -1 2852; GFX8-NEXT: s_max_i32 s6, s4, s5 2853; GFX8-NEXT: s_addk_i32 s6, 0x8001 2854; GFX8-NEXT: s_lshr_b32 s3, s1, 16 2855; GFX8-NEXT: s_min_i32 s4, s4, s5 2856; GFX8-NEXT: s_sext_i32_i16 s6, s6 2857; GFX8-NEXT: s_sext_i32_i16 s1, s1 2858; GFX8-NEXT: s_addk_i32 s4, 0x8000 2859; GFX8-NEXT: s_max_i32 s1, s6, s1 2860; GFX8-NEXT: s_sext_i32_i16 s1, s1 2861; GFX8-NEXT: s_sext_i32_i16 s4, s4 2862; GFX8-NEXT: s_lshr_b32 s2, s0, 16 2863; GFX8-NEXT: s_min_i32 s1, s1, s4 2864; GFX8-NEXT: s_sub_i32 s0, s0, s1 2865; GFX8-NEXT: s_sext_i32_i16 s1, s2 2866; GFX8-NEXT: s_max_i32 s4, s1, s5 2867; GFX8-NEXT: s_addk_i32 s4, 0x8001 2868; GFX8-NEXT: s_min_i32 s1, s1, s5 2869; GFX8-NEXT: s_sext_i32_i16 s4, s4 2870; GFX8-NEXT: s_sext_i32_i16 s3, s3 2871; GFX8-NEXT: s_addk_i32 s1, 0x8000 2872; GFX8-NEXT: s_max_i32 s3, s4, s3 2873; GFX8-NEXT: s_sext_i32_i16 s3, s3 2874; GFX8-NEXT: s_sext_i32_i16 s1, s1 2875; GFX8-NEXT: s_min_i32 s1, s3, s1 2876; GFX8-NEXT: s_sub_i32 s1, s2, s1 2877; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 2878; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 2879; GFX8-NEXT: s_lshl_b32 s1, s1, 16 2880; GFX8-NEXT: s_or_b32 s0, s0, s1 2881; GFX8-NEXT: ; return to shader part epilog 2882; 2883; GFX9-LABEL: s_ssubsat_v2i16: 2884; GFX9: ; %bb.0: 2885; GFX9-NEXT: v_mov_b32_e32 v0, s1 2886; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 2887; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2888; GFX9-NEXT: ; return to shader part epilog 2889; 2890; GFX10PLUS-LABEL: s_ssubsat_v2i16: 2891; GFX10PLUS: ; %bb.0: 2892; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s1 clamp 2893; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2894; GFX10PLUS-NEXT: ; return to shader part epilog 2895 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2896 %cast = bitcast <2 x i16> %result to i32 2897 ret i32 %cast 2898} 2899 2900define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { 2901; GFX6-LABEL: ssubsat_v2i16_sv: 2902; GFX6: ; %bb.0: 2903; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2904; GFX6-NEXT: s_max_i32 s2, s0, -1 2905; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2906; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 2907; GFX6-NEXT: s_min_i32 s3, s0, -1 2908; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 2909; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 2910; GFX6-NEXT: v_min_i32_e32 v0, s3, v0 2911; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2912; GFX6-NEXT: s_lshl_b32 s0, s1, 16 2913; GFX6-NEXT: s_max_i32 s1, s0, -1 2914; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2915; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 2916; GFX6-NEXT: s_min_i32 s2, s0, -1 2917; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 2918; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 2919; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 2920; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 2921; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2922; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2923; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2924; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2925; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2926; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2927; GFX6-NEXT: ; return to shader part epilog 2928; 2929; GFX8-LABEL: ssubsat_v2i16_sv: 2930; GFX8: ; %bb.0: 2931; GFX8-NEXT: s_sext_i32_i16 s2, s0 2932; GFX8-NEXT: s_sext_i32_i16 s3, -1 2933; GFX8-NEXT: s_max_i32 s4, s2, s3 2934; GFX8-NEXT: s_addk_i32 s4, 0x8001 2935; GFX8-NEXT: s_min_i32 s2, s2, s3 2936; GFX8-NEXT: s_lshr_b32 s1, s0, 16 2937; GFX8-NEXT: s_addk_i32 s2, 0x8000 2938; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 2939; GFX8-NEXT: v_min_i16_e32 v1, s2, v1 2940; GFX8-NEXT: s_sext_i32_i16 s2, s1 2941; GFX8-NEXT: s_max_i32 s4, s2, s3 2942; GFX8-NEXT: s_addk_i32 s4, 0x8001 2943; GFX8-NEXT: s_min_i32 s2, s2, s3 2944; GFX8-NEXT: v_mov_b32_e32 v2, s4 2945; GFX8-NEXT: s_addk_i32 s2, 0x8000 2946; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 2947; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 2948; GFX8-NEXT: v_mov_b32_e32 v2, s1 2949; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 2950; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2951; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 2952; GFX8-NEXT: ; return to shader part epilog 2953; 2954; GFX9-LABEL: ssubsat_v2i16_sv: 2955; GFX9: ; %bb.0: 2956; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 2957; GFX9-NEXT: ; return to shader part epilog 2958; 2959; GFX10PLUS-LABEL: ssubsat_v2i16_sv: 2960; GFX10PLUS: ; %bb.0: 2961; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 2962; GFX10PLUS-NEXT: ; return to shader part epilog 2963 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 2964 %cast = bitcast <2 x i16> %result to float 2965 ret float %cast 2966} 2967 2968define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { 2969; GFX6-LABEL: ssubsat_v2i16_vs: 2970; GFX6: ; %bb.0: 2971; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2972; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 2973; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2974; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 2975; GFX6-NEXT: v_min_i32_e32 v4, -1, v0 2976; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 2977; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 2978; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 2979; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 2980; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2981; GFX6-NEXT: v_mov_b32_e32 v3, 0x80000001 2982; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 2983; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 2984; GFX6-NEXT: s_lshl_b32 s0, s1, 16 2985; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 2986; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 2987; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 2988; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 2989; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 2990; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 2991; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 2992; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2993; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2994; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2995; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2996; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 2997; GFX6-NEXT: ; return to shader part epilog 2998; 2999; GFX8-LABEL: ssubsat_v2i16_vs: 3000; GFX8: ; %bb.0: 3001; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 3002; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 3003; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 3004; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 3005; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 3006; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 3007; GFX8-NEXT: v_mov_b32_e32 v2, -1 3008; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3009; GFX8-NEXT: s_lshr_b32 s1, s0, 16 3010; GFX8-NEXT: v_add_u16_e32 v3, 0x8001, v3 3011; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3012; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 3013; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 3014; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 3015; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 3016; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3017; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 3018; GFX8-NEXT: ; return to shader part epilog 3019; 3020; GFX9-LABEL: ssubsat_v2i16_vs: 3021; GFX9: ; %bb.0: 3022; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 clamp 3023; GFX9-NEXT: ; return to shader part epilog 3024; 3025; GFX10PLUS-LABEL: ssubsat_v2i16_vs: 3026; GFX10PLUS: ; %bb.0: 3027; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, s0 clamp 3028; GFX10PLUS-NEXT: ; return to shader part epilog 3029 %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 3030 %cast = bitcast <2 x i16> %result to float 3031 ret float %cast 3032} 3033 3034; FIXME: v3i16 insert/extract 3035; define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { 3036; %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 3037; ret <3 x i16> %result 3038; } 3039 3040; define amdgpu_ps <3 x i16> @s_ssubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) { 3041; %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 3042; ret <3 x i16> %result 3043; } 3044 3045define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { 3046; GFX6-LABEL: v_ssubsat_v4i16: 3047; GFX6: ; %bb.0: 3048; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3049; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3050; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 3051; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3052; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 3053; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 3054; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 3055; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 3056; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 3057; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 3058; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3059; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 3060; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 3061; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 3062; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 3063; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 3064; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 3065; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 3066; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 3067; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3068; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 3069; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 3070; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 3071; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 3072; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 3073; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 3074; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 3075; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 3076; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3077; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 3078; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 3079; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 3080; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 3081; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 3082; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 3083; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 3084; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 3085; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3086; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 3087; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3088; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 3089; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3090; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3091; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3092; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3093; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3094; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3095; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 3096; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 3097; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3098; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3099; GFX6-NEXT: s_setpc_b64 s[30:31] 3100; 3101; GFX8-LABEL: v_ssubsat_v4i16: 3102; GFX8: ; %bb.0: 3103; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3104; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 3105; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 3106; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 3107; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 3108; GFX8-NEXT: v_max_i16_e32 v4, v4, v2 3109; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 3110; GFX8-NEXT: v_mov_b32_e32 v5, -1 3111; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3112; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 3113; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3114; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 3115; GFX8-NEXT: v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3116; GFX8-NEXT: v_max_i16_e32 v6, -1, v1 3117; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 3118; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 3119; GFX8-NEXT: v_min_i16_e32 v7, -1, v1 3120; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 3121; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 3122; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 3123; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3124; GFX8-NEXT: v_add_u16_e32 v7, 0x8001, v7 3125; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3126; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 3127; GFX8-NEXT: v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3128; GFX8-NEXT: v_min_i16_e32 v3, v3, v5 3129; GFX8-NEXT: v_sub_u16_e32 v4, v0, v4 3130; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3131; GFX8-NEXT: v_sub_u16_e32 v2, v1, v6 3132; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3133; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 3134; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 3135; GFX8-NEXT: s_setpc_b64 s[30:31] 3136; 3137; GFX9-LABEL: v_ssubsat_v4i16: 3138; GFX9: ; %bb.0: 3139; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3140; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp 3141; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp 3142; GFX9-NEXT: s_setpc_b64 s[30:31] 3143; 3144; GFX10PLUS-LABEL: v_ssubsat_v4i16: 3145; GFX10PLUS: ; %bb.0: 3146; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3147; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v2 clamp 3148; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v3 clamp 3149; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 3150 %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 3151 %cast = bitcast <4 x i16> %result to <2 x float> 3152 ret <2 x float> %cast 3153} 3154 3155define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { 3156; GFX6-LABEL: s_ssubsat_v4i16: 3157; GFX6: ; %bb.0: 3158; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3159; GFX6-NEXT: s_max_i32 s8, s0, -1 3160; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3161; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 3162; GFX6-NEXT: s_min_i32 s9, s0, -1 3163; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 3164; GFX6-NEXT: s_max_i32 s4, s8, s4 3165; GFX6-NEXT: s_min_i32 s4, s4, s9 3166; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3167; GFX6-NEXT: s_sub_i32 s0, s0, s4 3168; GFX6-NEXT: s_lshl_b32 s4, s5, 16 3169; GFX6-NEXT: s_max_i32 s5, s1, -1 3170; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 3171; GFX6-NEXT: s_min_i32 s8, s1, -1 3172; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 3173; GFX6-NEXT: s_max_i32 s4, s5, s4 3174; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3175; GFX6-NEXT: s_min_i32 s4, s4, s8 3176; GFX6-NEXT: s_max_i32 s5, s2, -1 3177; GFX6-NEXT: s_sub_i32 s1, s1, s4 3178; GFX6-NEXT: s_lshl_b32 s4, s6, 16 3179; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 3180; GFX6-NEXT: s_min_i32 s6, s2, -1 3181; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 3182; GFX6-NEXT: s_max_i32 s4, s5, s4 3183; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3184; GFX6-NEXT: s_min_i32 s4, s4, s6 3185; GFX6-NEXT: s_max_i32 s5, s3, -1 3186; GFX6-NEXT: s_sub_i32 s2, s2, s4 3187; GFX6-NEXT: s_lshl_b32 s4, s7, 16 3188; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 3189; GFX6-NEXT: s_min_i32 s6, s3, -1 3190; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 3191; GFX6-NEXT: s_max_i32 s4, s5, s4 3192; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3193; GFX6-NEXT: s_min_i32 s4, s4, s6 3194; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3195; GFX6-NEXT: s_sub_i32 s3, s3, s4 3196; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 3197; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3198; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3199; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 3200; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3201; GFX6-NEXT: s_or_b32 s0, s0, s1 3202; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 3203; GFX6-NEXT: s_and_b32 s2, s3, 0xffff 3204; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3205; GFX6-NEXT: s_or_b32 s1, s1, s2 3206; GFX6-NEXT: ; return to shader part epilog 3207; 3208; GFX8-LABEL: s_ssubsat_v4i16: 3209; GFX8: ; %bb.0: 3210; GFX8-NEXT: s_sext_i32_i16 s8, s0 3211; GFX8-NEXT: s_sext_i32_i16 s9, -1 3212; GFX8-NEXT: s_max_i32 s10, s8, s9 3213; GFX8-NEXT: s_addk_i32 s10, 0x8001 3214; GFX8-NEXT: s_lshr_b32 s6, s2, 16 3215; GFX8-NEXT: s_min_i32 s8, s8, s9 3216; GFX8-NEXT: s_sext_i32_i16 s10, s10 3217; GFX8-NEXT: s_sext_i32_i16 s2, s2 3218; GFX8-NEXT: s_addk_i32 s8, 0x8000 3219; GFX8-NEXT: s_max_i32 s2, s10, s2 3220; GFX8-NEXT: s_sext_i32_i16 s2, s2 3221; GFX8-NEXT: s_sext_i32_i16 s8, s8 3222; GFX8-NEXT: s_lshr_b32 s4, s0, 16 3223; GFX8-NEXT: s_min_i32 s2, s2, s8 3224; GFX8-NEXT: s_sub_i32 s0, s0, s2 3225; GFX8-NEXT: s_sext_i32_i16 s2, s4 3226; GFX8-NEXT: s_max_i32 s8, s2, s9 3227; GFX8-NEXT: s_addk_i32 s8, 0x8001 3228; GFX8-NEXT: s_min_i32 s2, s2, s9 3229; GFX8-NEXT: s_sext_i32_i16 s8, s8 3230; GFX8-NEXT: s_sext_i32_i16 s6, s6 3231; GFX8-NEXT: s_addk_i32 s2, 0x8000 3232; GFX8-NEXT: s_max_i32 s6, s8, s6 3233; GFX8-NEXT: s_sext_i32_i16 s6, s6 3234; GFX8-NEXT: s_sext_i32_i16 s2, s2 3235; GFX8-NEXT: s_min_i32 s2, s6, s2 3236; GFX8-NEXT: s_sub_i32 s2, s4, s2 3237; GFX8-NEXT: s_sext_i32_i16 s4, s1 3238; GFX8-NEXT: s_max_i32 s6, s4, s9 3239; GFX8-NEXT: s_addk_i32 s6, 0x8001 3240; GFX8-NEXT: s_lshr_b32 s7, s3, 16 3241; GFX8-NEXT: s_min_i32 s4, s4, s9 3242; GFX8-NEXT: s_sext_i32_i16 s6, s6 3243; GFX8-NEXT: s_sext_i32_i16 s3, s3 3244; GFX8-NEXT: s_addk_i32 s4, 0x8000 3245; GFX8-NEXT: s_max_i32 s3, s6, s3 3246; GFX8-NEXT: s_sext_i32_i16 s3, s3 3247; GFX8-NEXT: s_sext_i32_i16 s4, s4 3248; GFX8-NEXT: s_lshr_b32 s5, s1, 16 3249; GFX8-NEXT: s_min_i32 s3, s3, s4 3250; GFX8-NEXT: s_sub_i32 s1, s1, s3 3251; GFX8-NEXT: s_sext_i32_i16 s3, s5 3252; GFX8-NEXT: s_max_i32 s4, s3, s9 3253; GFX8-NEXT: s_addk_i32 s4, 0x8001 3254; GFX8-NEXT: s_min_i32 s3, s3, s9 3255; GFX8-NEXT: s_sext_i32_i16 s4, s4 3256; GFX8-NEXT: s_sext_i32_i16 s6, s7 3257; GFX8-NEXT: s_addk_i32 s3, 0x8000 3258; GFX8-NEXT: s_max_i32 s4, s4, s6 3259; GFX8-NEXT: s_sext_i32_i16 s4, s4 3260; GFX8-NEXT: s_sext_i32_i16 s3, s3 3261; GFX8-NEXT: s_min_i32 s3, s4, s3 3262; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 3263; GFX8-NEXT: s_sub_i32 s3, s5, s3 3264; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 3265; GFX8-NEXT: s_lshl_b32 s2, s2, 16 3266; GFX8-NEXT: s_or_b32 s0, s0, s2 3267; GFX8-NEXT: s_and_b32 s2, 0xffff, s3 3268; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 3269; GFX8-NEXT: s_lshl_b32 s2, s2, 16 3270; GFX8-NEXT: s_or_b32 s1, s1, s2 3271; GFX8-NEXT: ; return to shader part epilog 3272; 3273; GFX9-LABEL: s_ssubsat_v4i16: 3274; GFX9: ; %bb.0: 3275; GFX9-NEXT: v_mov_b32_e32 v0, s2 3276; GFX9-NEXT: v_mov_b32_e32 v1, s3 3277; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 3278; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp 3279; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3280; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3281; GFX9-NEXT: ; return to shader part epilog 3282; 3283; GFX10PLUS-LABEL: s_ssubsat_v4i16: 3284; GFX10PLUS: ; %bb.0: 3285; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s2 clamp 3286; GFX10PLUS-NEXT: v_pk_sub_i16 v1, s1, s3 clamp 3287; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 3288; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 3289; GFX10PLUS-NEXT: ; return to shader part epilog 3290 %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 3291 %cast = bitcast <4 x i16> %result to <2 x i32> 3292 ret <2 x i32> %cast 3293} 3294 3295; FIXME 3296; define <5 x i16> @v_ssubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) { 3297; %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 3298; ret <5 x i16> %result 3299; } 3300 3301; define amdgpu_ps <5 x i16> @s_ssubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) { 3302; %result = call <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 3303; ret <5 x i16> %result 3304; } 3305 3306define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { 3307; GFX6-LABEL: v_ssubsat_v6i16: 3308; GFX6: ; %bb.0: 3309; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3310; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3311; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 3312; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3313; GFX6-NEXT: v_add_i32_e32 v12, vcc, 0x80000001, v12 3314; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 3315; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 3316; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v15 3317; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 3318; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 3319; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3320; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000001 3321; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 3322; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 3323; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 3324; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 3325; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 3326; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v15 3327; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 3328; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3329; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 3330; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 3331; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 3332; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 3333; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 3334; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 3335; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 3336; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 3337; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3338; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 3339; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 3340; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 3341; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 3342; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 3343; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 3344; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 3345; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 3346; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3347; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 3348; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 3349; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 3350; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 3351; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 3352; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 3353; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 3354; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 3355; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3356; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 3357; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 3358; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 3359; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 3360; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 3361; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 3362; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3363; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 3364; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 3365; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3366; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 3367; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3368; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3369; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3370; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 3371; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3372; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3373; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 3374; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3375; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 3376; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 3377; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 3378; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3379; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 3380; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3381; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 3382; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3383; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3384; GFX6-NEXT: s_setpc_b64 s[30:31] 3385; 3386; GFX8-LABEL: v_ssubsat_v6i16: 3387; GFX8: ; %bb.0: 3388; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3389; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 3390; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 3391; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 3392; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 3393; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 3394; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 3395; GFX8-NEXT: v_mov_b32_e32 v7, -1 3396; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3397; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 3398; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3399; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 3400; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3401; GFX8-NEXT: v_max_i16_e32 v8, -1, v1 3402; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 3403; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 3404; GFX8-NEXT: v_min_i16_e32 v9, -1, v1 3405; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 3406; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 3407; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 3408; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3409; GFX8-NEXT: v_add_u16_e32 v9, 0x8001, v9 3410; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3411; GFX8-NEXT: v_add_u16_e32 v10, 0x8000, v10 3412; GFX8-NEXT: v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3413; GFX8-NEXT: v_max_i16_e32 v9, -1, v2 3414; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 3415; GFX8-NEXT: v_add_u16_e32 v9, 0x8001, v9 3416; GFX8-NEXT: v_min_i16_e32 v10, -1, v2 3417; GFX8-NEXT: v_add_u16_e32 v10, 0x8000, v10 3418; GFX8-NEXT: v_max_i16_e32 v9, v9, v5 3419; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 3420; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3421; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 3422; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3423; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 3424; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3425; GFX8-NEXT: v_min_i16_e32 v5, v5, v7 3426; GFX8-NEXT: v_sub_u16_e32 v6, v0, v6 3427; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3428; GFX8-NEXT: v_sub_u16_e32 v3, v1, v8 3429; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3430; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 3431; GFX8-NEXT: v_sub_u16_e32 v3, v2, v9 3432; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3433; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 3434; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 3435; GFX8-NEXT: s_setpc_b64 s[30:31] 3436; 3437; GFX9-LABEL: v_ssubsat_v6i16: 3438; GFX9: ; %bb.0: 3439; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3440; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 clamp 3441; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 clamp 3442; GFX9-NEXT: v_pk_sub_i16 v2, v2, v5 clamp 3443; GFX9-NEXT: s_setpc_b64 s[30:31] 3444; 3445; GFX10PLUS-LABEL: v_ssubsat_v6i16: 3446; GFX10PLUS: ; %bb.0: 3447; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3448; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v3 clamp 3449; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v4 clamp 3450; GFX10PLUS-NEXT: v_pk_sub_i16 v2, v2, v5 clamp 3451; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 3452 %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 3453 %cast = bitcast <6 x i16> %result to <3 x float> 3454 ret <3 x float> %cast 3455} 3456 3457define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { 3458; GFX6-LABEL: s_ssubsat_v6i16: 3459; GFX6: ; %bb.0: 3460; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3461; GFX6-NEXT: s_max_i32 s12, s0, -1 3462; GFX6-NEXT: s_lshl_b32 s6, s6, 16 3463; GFX6-NEXT: s_add_i32 s12, s12, 0x80000001 3464; GFX6-NEXT: s_min_i32 s13, s0, -1 3465; GFX6-NEXT: s_add_i32 s13, s13, 0x80000000 3466; GFX6-NEXT: s_max_i32 s6, s12, s6 3467; GFX6-NEXT: s_min_i32 s6, s6, s13 3468; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3469; GFX6-NEXT: s_sub_i32 s0, s0, s6 3470; GFX6-NEXT: s_lshl_b32 s6, s7, 16 3471; GFX6-NEXT: s_max_i32 s7, s1, -1 3472; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 3473; GFX6-NEXT: s_min_i32 s12, s1, -1 3474; GFX6-NEXT: s_add_i32 s12, s12, 0x80000000 3475; GFX6-NEXT: s_max_i32 s6, s7, s6 3476; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3477; GFX6-NEXT: s_min_i32 s6, s6, s12 3478; GFX6-NEXT: s_max_i32 s7, s2, -1 3479; GFX6-NEXT: s_sub_i32 s1, s1, s6 3480; GFX6-NEXT: s_lshl_b32 s6, s8, 16 3481; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 3482; GFX6-NEXT: s_min_i32 s8, s2, -1 3483; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 3484; GFX6-NEXT: s_max_i32 s6, s7, s6 3485; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3486; GFX6-NEXT: s_min_i32 s6, s6, s8 3487; GFX6-NEXT: s_max_i32 s7, s3, -1 3488; GFX6-NEXT: s_sub_i32 s2, s2, s6 3489; GFX6-NEXT: s_lshl_b32 s6, s9, 16 3490; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 3491; GFX6-NEXT: s_min_i32 s8, s3, -1 3492; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 3493; GFX6-NEXT: s_max_i32 s6, s7, s6 3494; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3495; GFX6-NEXT: s_min_i32 s6, s6, s8 3496; GFX6-NEXT: s_max_i32 s7, s4, -1 3497; GFX6-NEXT: s_sub_i32 s3, s3, s6 3498; GFX6-NEXT: s_lshl_b32 s6, s10, 16 3499; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 3500; GFX6-NEXT: s_min_i32 s8, s4, -1 3501; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 3502; GFX6-NEXT: s_max_i32 s6, s7, s6 3503; GFX6-NEXT: s_lshl_b32 s5, s5, 16 3504; GFX6-NEXT: s_min_i32 s6, s6, s8 3505; GFX6-NEXT: s_max_i32 s7, s5, -1 3506; GFX6-NEXT: s_sub_i32 s4, s4, s6 3507; GFX6-NEXT: s_lshl_b32 s6, s11, 16 3508; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 3509; GFX6-NEXT: s_min_i32 s8, s5, -1 3510; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3511; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 3512; GFX6-NEXT: s_max_i32 s6, s7, s6 3513; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3514; GFX6-NEXT: s_min_i32 s6, s6, s8 3515; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 3516; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3517; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3518; GFX6-NEXT: s_sub_i32 s5, s5, s6 3519; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 3520; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3521; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3522; GFX6-NEXT: s_or_b32 s0, s0, s1 3523; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 3524; GFX6-NEXT: s_and_b32 s2, s3, 0xffff 3525; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3526; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3527; GFX6-NEXT: s_and_b32 s3, s5, 0xffff 3528; GFX6-NEXT: s_or_b32 s1, s1, s2 3529; GFX6-NEXT: s_and_b32 s2, s4, 0xffff 3530; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3531; GFX6-NEXT: s_or_b32 s2, s2, s3 3532; GFX6-NEXT: ; return to shader part epilog 3533; 3534; GFX8-LABEL: s_ssubsat_v6i16: 3535; GFX8: ; %bb.0: 3536; GFX8-NEXT: s_sext_i32_i16 s12, s0 3537; GFX8-NEXT: s_sext_i32_i16 s13, -1 3538; GFX8-NEXT: s_max_i32 s14, s12, s13 3539; GFX8-NEXT: s_addk_i32 s14, 0x8001 3540; GFX8-NEXT: s_lshr_b32 s9, s3, 16 3541; GFX8-NEXT: s_min_i32 s12, s12, s13 3542; GFX8-NEXT: s_sext_i32_i16 s14, s14 3543; GFX8-NEXT: s_sext_i32_i16 s3, s3 3544; GFX8-NEXT: s_addk_i32 s12, 0x8000 3545; GFX8-NEXT: s_max_i32 s3, s14, s3 3546; GFX8-NEXT: s_sext_i32_i16 s3, s3 3547; GFX8-NEXT: s_sext_i32_i16 s12, s12 3548; GFX8-NEXT: s_lshr_b32 s6, s0, 16 3549; GFX8-NEXT: s_min_i32 s3, s3, s12 3550; GFX8-NEXT: s_sub_i32 s0, s0, s3 3551; GFX8-NEXT: s_sext_i32_i16 s3, s6 3552; GFX8-NEXT: s_max_i32 s12, s3, s13 3553; GFX8-NEXT: s_addk_i32 s12, 0x8001 3554; GFX8-NEXT: s_min_i32 s3, s3, s13 3555; GFX8-NEXT: s_sext_i32_i16 s12, s12 3556; GFX8-NEXT: s_sext_i32_i16 s9, s9 3557; GFX8-NEXT: s_addk_i32 s3, 0x8000 3558; GFX8-NEXT: s_max_i32 s9, s12, s9 3559; GFX8-NEXT: s_sext_i32_i16 s9, s9 3560; GFX8-NEXT: s_sext_i32_i16 s3, s3 3561; GFX8-NEXT: s_min_i32 s3, s9, s3 3562; GFX8-NEXT: s_sub_i32 s3, s6, s3 3563; GFX8-NEXT: s_sext_i32_i16 s6, s1 3564; GFX8-NEXT: s_max_i32 s9, s6, s13 3565; GFX8-NEXT: s_addk_i32 s9, 0x8001 3566; GFX8-NEXT: s_lshr_b32 s10, s4, 16 3567; GFX8-NEXT: s_min_i32 s6, s6, s13 3568; GFX8-NEXT: s_sext_i32_i16 s9, s9 3569; GFX8-NEXT: s_sext_i32_i16 s4, s4 3570; GFX8-NEXT: s_addk_i32 s6, 0x8000 3571; GFX8-NEXT: s_max_i32 s4, s9, s4 3572; GFX8-NEXT: s_sext_i32_i16 s4, s4 3573; GFX8-NEXT: s_sext_i32_i16 s6, s6 3574; GFX8-NEXT: s_lshr_b32 s7, s1, 16 3575; GFX8-NEXT: s_min_i32 s4, s4, s6 3576; GFX8-NEXT: s_sub_i32 s1, s1, s4 3577; GFX8-NEXT: s_sext_i32_i16 s4, s7 3578; GFX8-NEXT: s_max_i32 s6, s4, s13 3579; GFX8-NEXT: s_addk_i32 s6, 0x8001 3580; GFX8-NEXT: s_min_i32 s4, s4, s13 3581; GFX8-NEXT: s_sext_i32_i16 s6, s6 3582; GFX8-NEXT: s_sext_i32_i16 s9, s10 3583; GFX8-NEXT: s_addk_i32 s4, 0x8000 3584; GFX8-NEXT: s_max_i32 s6, s6, s9 3585; GFX8-NEXT: s_sext_i32_i16 s6, s6 3586; GFX8-NEXT: s_sext_i32_i16 s4, s4 3587; GFX8-NEXT: s_min_i32 s4, s6, s4 3588; GFX8-NEXT: s_sext_i32_i16 s6, s2 3589; GFX8-NEXT: s_sub_i32 s4, s7, s4 3590; GFX8-NEXT: s_max_i32 s7, s6, s13 3591; GFX8-NEXT: s_addk_i32 s7, 0x8001 3592; GFX8-NEXT: s_lshr_b32 s11, s5, 16 3593; GFX8-NEXT: s_min_i32 s6, s6, s13 3594; GFX8-NEXT: s_sext_i32_i16 s7, s7 3595; GFX8-NEXT: s_sext_i32_i16 s5, s5 3596; GFX8-NEXT: s_addk_i32 s6, 0x8000 3597; GFX8-NEXT: s_max_i32 s5, s7, s5 3598; GFX8-NEXT: s_sext_i32_i16 s5, s5 3599; GFX8-NEXT: s_sext_i32_i16 s6, s6 3600; GFX8-NEXT: s_lshr_b32 s8, s2, 16 3601; GFX8-NEXT: s_min_i32 s5, s5, s6 3602; GFX8-NEXT: s_sub_i32 s2, s2, s5 3603; GFX8-NEXT: s_sext_i32_i16 s5, s8 3604; GFX8-NEXT: s_max_i32 s6, s5, s13 3605; GFX8-NEXT: s_addk_i32 s6, 0x8001 3606; GFX8-NEXT: s_min_i32 s5, s5, s13 3607; GFX8-NEXT: s_sext_i32_i16 s6, s6 3608; GFX8-NEXT: s_sext_i32_i16 s7, s11 3609; GFX8-NEXT: s_addk_i32 s5, 0x8000 3610; GFX8-NEXT: s_max_i32 s6, s6, s7 3611; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 3612; GFX8-NEXT: s_sext_i32_i16 s6, s6 3613; GFX8-NEXT: s_sext_i32_i16 s5, s5 3614; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 3615; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3616; GFX8-NEXT: s_min_i32 s5, s6, s5 3617; GFX8-NEXT: s_or_b32 s0, s0, s3 3618; GFX8-NEXT: s_and_b32 s3, 0xffff, s4 3619; GFX8-NEXT: s_sub_i32 s5, s8, s5 3620; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 3621; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3622; GFX8-NEXT: s_or_b32 s1, s1, s3 3623; GFX8-NEXT: s_and_b32 s3, 0xffff, s5 3624; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 3625; GFX8-NEXT: s_lshl_b32 s3, s3, 16 3626; GFX8-NEXT: s_or_b32 s2, s2, s3 3627; GFX8-NEXT: ; return to shader part epilog 3628; 3629; GFX9-LABEL: s_ssubsat_v6i16: 3630; GFX9: ; %bb.0: 3631; GFX9-NEXT: v_mov_b32_e32 v0, s3 3632; GFX9-NEXT: v_mov_b32_e32 v1, s4 3633; GFX9-NEXT: v_mov_b32_e32 v2, s5 3634; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 3635; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp 3636; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp 3637; GFX9-NEXT: v_readfirstlane_b32 s0, v0 3638; GFX9-NEXT: v_readfirstlane_b32 s1, v1 3639; GFX9-NEXT: v_readfirstlane_b32 s2, v2 3640; GFX9-NEXT: ; return to shader part epilog 3641; 3642; GFX10PLUS-LABEL: s_ssubsat_v6i16: 3643; GFX10PLUS: ; %bb.0: 3644; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s3 clamp 3645; GFX10PLUS-NEXT: v_pk_sub_i16 v1, s1, s4 clamp 3646; GFX10PLUS-NEXT: v_pk_sub_i16 v2, s2, s5 clamp 3647; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 3648; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 3649; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 3650; GFX10PLUS-NEXT: ; return to shader part epilog 3651 %result = call <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 3652 %cast = bitcast <6 x i16> %result to <3 x i32> 3653 ret <3 x i32> %cast 3654} 3655 3656define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { 3657; GFX6-LABEL: v_ssubsat_v8i16: 3658; GFX6: ; %bb.0: 3659; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3660; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3661; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 3662; GFX6-NEXT: v_mov_b32_e32 v17, 0x80000001 3663; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 3664; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v17 3665; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 3666; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 3667; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v19 3668; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 3669; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 3670; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3671; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 3672; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 3673; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 3674; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3675; GFX6-NEXT: v_min_i32_e32 v16, -1, v1 3676; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v19 3677; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3678; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3679; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 3680; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 3681; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 3682; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 3683; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3684; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 3685; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 3686; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3687; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3688; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 3689; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 3690; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 3691; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 3692; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3693; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 3694; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 3695; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3696; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3697; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 3698; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 3699; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 3700; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 3701; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3702; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 3703; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 3704; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3705; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 3706; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 3707; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 3708; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 3709; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 3710; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3711; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 3712; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 3713; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3714; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 3715; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 3716; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 3717; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 3718; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 3719; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3720; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 3721; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 3722; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3723; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 3724; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 3725; GFX6-NEXT: v_max_i32_e32 v9, -1, v7 3726; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 3727; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 3728; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 3729; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 3730; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 3731; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 3732; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 3733; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 3734; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3735; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 3736; GFX6-NEXT: v_ashrrev_i32_e32 v3, 16, v3 3737; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 3738; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3739; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3740; GFX6-NEXT: v_ashrrev_i32_e32 v5, 16, v5 3741; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 3742; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3743; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v2 3744; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 3745; GFX6-NEXT: v_ashrrev_i32_e32 v4, 16, v4 3746; GFX6-NEXT: v_ashrrev_i32_e32 v7, 16, v7 3747; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3748; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v5 3749; GFX6-NEXT: v_ashrrev_i32_e32 v6, 16, v6 3750; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3751; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v4 3752; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3753; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v7 3754; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 3755; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v6 3756; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 3757; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 3758; GFX6-NEXT: s_setpc_b64 s[30:31] 3759; 3760; GFX8-LABEL: v_ssubsat_v8i16: 3761; GFX8: ; %bb.0: 3762; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3763; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 3764; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 3765; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 3766; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 3767; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 3768; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 3769; GFX8-NEXT: v_mov_b32_e32 v9, -1 3770; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3771; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 3772; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3773; GFX8-NEXT: v_add_u16_e32 v11, 0x8000, v11 3774; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3775; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 3776; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 3777; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 3778; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 3779; GFX8-NEXT: v_add_u16_e32 v11, 0x8000, v11 3780; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 3781; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 3782; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3783; GFX8-NEXT: v_add_u16_e32 v11, 0x8001, v11 3784; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3785; GFX8-NEXT: v_add_u16_e32 v12, 0x8000, v12 3786; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3787; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 3788; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 3789; GFX8-NEXT: v_add_u16_e32 v11, 0x8001, v11 3790; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 3791; GFX8-NEXT: v_add_u16_e32 v12, 0x8000, v12 3792; GFX8-NEXT: v_max_i16_e32 v11, v11, v6 3793; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 3794; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3795; GFX8-NEXT: v_add_u16_e32 v12, 0x8001, v12 3796; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3797; GFX8-NEXT: v_add_u16_e32 v13, 0x8000, v13 3798; GFX8-NEXT: v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3799; GFX8-NEXT: v_max_i16_e32 v12, -1, v3 3800; GFX8-NEXT: v_min_i16_e32 v6, v6, v13 3801; GFX8-NEXT: v_add_u16_e32 v12, 0x8001, v12 3802; GFX8-NEXT: v_min_i16_e32 v13, -1, v3 3803; GFX8-NEXT: v_add_u16_e32 v13, 0x8000, v13 3804; GFX8-NEXT: v_max_i16_e32 v12, v12, v7 3805; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 3806; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3807; GFX8-NEXT: v_add_u16_e32 v13, 0x8001, v13 3808; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3809; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 3810; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 3811; GFX8-NEXT: v_sub_u16_e32 v8, v0, v8 3812; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3813; GFX8-NEXT: v_sub_u16_e32 v4, v1, v10 3814; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3815; GFX8-NEXT: v_min_i16_e32 v7, v7, v9 3816; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 3817; GFX8-NEXT: v_sub_u16_e32 v4, v2, v11 3818; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3819; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 3820; GFX8-NEXT: v_sub_u16_e32 v4, v3, v12 3821; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3822; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 3823; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 3824; GFX8-NEXT: s_setpc_b64 s[30:31] 3825; 3826; GFX9-LABEL: v_ssubsat_v8i16: 3827; GFX9: ; %bb.0: 3828; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3829; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 clamp 3830; GFX9-NEXT: v_pk_sub_i16 v1, v1, v5 clamp 3831; GFX9-NEXT: v_pk_sub_i16 v2, v2, v6 clamp 3832; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7 clamp 3833; GFX9-NEXT: s_setpc_b64 s[30:31] 3834; 3835; GFX10PLUS-LABEL: v_ssubsat_v8i16: 3836; GFX10PLUS: ; %bb.0: 3837; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3838; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v4 clamp 3839; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v5 clamp 3840; GFX10PLUS-NEXT: v_pk_sub_i16 v2, v2, v6 clamp 3841; GFX10PLUS-NEXT: v_pk_sub_i16 v3, v3, v7 clamp 3842; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 3843 %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 3844 %cast = bitcast <8 x i16> %result to <4 x float> 3845 ret <4 x float> %cast 3846} 3847 3848define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { 3849; GFX6-LABEL: s_ssubsat_v8i16: 3850; GFX6: ; %bb.0: 3851; GFX6-NEXT: s_lshl_b32 s0, s0, 16 3852; GFX6-NEXT: s_max_i32 s16, s0, -1 3853; GFX6-NEXT: s_lshl_b32 s8, s8, 16 3854; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 3855; GFX6-NEXT: s_min_i32 s17, s0, -1 3856; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 3857; GFX6-NEXT: s_max_i32 s8, s16, s8 3858; GFX6-NEXT: s_min_i32 s8, s8, s17 3859; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3860; GFX6-NEXT: s_sub_i32 s0, s0, s8 3861; GFX6-NEXT: s_lshl_b32 s8, s9, 16 3862; GFX6-NEXT: s_max_i32 s9, s1, -1 3863; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3864; GFX6-NEXT: s_min_i32 s16, s1, -1 3865; GFX6-NEXT: s_add_i32 s16, s16, 0x80000000 3866; GFX6-NEXT: s_max_i32 s8, s9, s8 3867; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3868; GFX6-NEXT: s_min_i32 s8, s8, s16 3869; GFX6-NEXT: s_max_i32 s9, s2, -1 3870; GFX6-NEXT: s_sub_i32 s1, s1, s8 3871; GFX6-NEXT: s_lshl_b32 s8, s10, 16 3872; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3873; GFX6-NEXT: s_min_i32 s10, s2, -1 3874; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 3875; GFX6-NEXT: s_max_i32 s8, s9, s8 3876; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3877; GFX6-NEXT: s_min_i32 s8, s8, s10 3878; GFX6-NEXT: s_max_i32 s9, s3, -1 3879; GFX6-NEXT: s_sub_i32 s2, s2, s8 3880; GFX6-NEXT: s_lshl_b32 s8, s11, 16 3881; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3882; GFX6-NEXT: s_min_i32 s10, s3, -1 3883; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 3884; GFX6-NEXT: s_max_i32 s8, s9, s8 3885; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3886; GFX6-NEXT: s_min_i32 s8, s8, s10 3887; GFX6-NEXT: s_max_i32 s9, s4, -1 3888; GFX6-NEXT: s_sub_i32 s3, s3, s8 3889; GFX6-NEXT: s_lshl_b32 s8, s12, 16 3890; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3891; GFX6-NEXT: s_min_i32 s10, s4, -1 3892; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 3893; GFX6-NEXT: s_max_i32 s8, s9, s8 3894; GFX6-NEXT: s_lshl_b32 s5, s5, 16 3895; GFX6-NEXT: s_min_i32 s8, s8, s10 3896; GFX6-NEXT: s_max_i32 s9, s5, -1 3897; GFX6-NEXT: s_sub_i32 s4, s4, s8 3898; GFX6-NEXT: s_lshl_b32 s8, s13, 16 3899; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3900; GFX6-NEXT: s_min_i32 s10, s5, -1 3901; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 3902; GFX6-NEXT: s_max_i32 s8, s9, s8 3903; GFX6-NEXT: s_lshl_b32 s6, s6, 16 3904; GFX6-NEXT: s_min_i32 s8, s8, s10 3905; GFX6-NEXT: s_max_i32 s9, s6, -1 3906; GFX6-NEXT: s_sub_i32 s5, s5, s8 3907; GFX6-NEXT: s_lshl_b32 s8, s14, 16 3908; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3909; GFX6-NEXT: s_min_i32 s10, s6, -1 3910; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 3911; GFX6-NEXT: s_max_i32 s8, s9, s8 3912; GFX6-NEXT: s_lshl_b32 s7, s7, 16 3913; GFX6-NEXT: s_min_i32 s8, s8, s10 3914; GFX6-NEXT: s_max_i32 s9, s7, -1 3915; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3916; GFX6-NEXT: s_sub_i32 s6, s6, s8 3917; GFX6-NEXT: s_lshl_b32 s8, s15, 16 3918; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 3919; GFX6-NEXT: s_min_i32 s10, s7, -1 3920; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3921; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 3922; GFX6-NEXT: s_max_i32 s8, s9, s8 3923; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 3924; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3925; GFX6-NEXT: s_ashr_i32 s3, s3, 16 3926; GFX6-NEXT: s_min_i32 s8, s8, s10 3927; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 3928; GFX6-NEXT: s_lshl_b32 s1, s1, 16 3929; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3930; GFX6-NEXT: s_sub_i32 s7, s7, s8 3931; GFX6-NEXT: s_or_b32 s0, s0, s1 3932; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 3933; GFX6-NEXT: s_and_b32 s2, s3, 0xffff 3934; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3935; GFX6-NEXT: s_ashr_i32 s7, s7, 16 3936; GFX6-NEXT: s_lshl_b32 s2, s2, 16 3937; GFX6-NEXT: s_and_b32 s3, s5, 0xffff 3938; GFX6-NEXT: s_ashr_i32 s6, s6, 16 3939; GFX6-NEXT: s_or_b32 s1, s1, s2 3940; GFX6-NEXT: s_and_b32 s2, s4, 0xffff 3941; GFX6-NEXT: s_lshl_b32 s3, s3, 16 3942; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 3943; GFX6-NEXT: s_or_b32 s2, s2, s3 3944; GFX6-NEXT: s_and_b32 s3, s6, 0xffff 3945; GFX6-NEXT: s_lshl_b32 s4, s4, 16 3946; GFX6-NEXT: s_or_b32 s3, s3, s4 3947; GFX6-NEXT: ; return to shader part epilog 3948; 3949; GFX8-LABEL: s_ssubsat_v8i16: 3950; GFX8: ; %bb.0: 3951; GFX8-NEXT: s_sext_i32_i16 s16, s0 3952; GFX8-NEXT: s_sext_i32_i16 s17, -1 3953; GFX8-NEXT: s_max_i32 s18, s16, s17 3954; GFX8-NEXT: s_addk_i32 s18, 0x8001 3955; GFX8-NEXT: s_lshr_b32 s12, s4, 16 3956; GFX8-NEXT: s_min_i32 s16, s16, s17 3957; GFX8-NEXT: s_sext_i32_i16 s18, s18 3958; GFX8-NEXT: s_sext_i32_i16 s4, s4 3959; GFX8-NEXT: s_addk_i32 s16, 0x8000 3960; GFX8-NEXT: s_max_i32 s4, s18, s4 3961; GFX8-NEXT: s_sext_i32_i16 s4, s4 3962; GFX8-NEXT: s_sext_i32_i16 s16, s16 3963; GFX8-NEXT: s_lshr_b32 s8, s0, 16 3964; GFX8-NEXT: s_min_i32 s4, s4, s16 3965; GFX8-NEXT: s_sub_i32 s0, s0, s4 3966; GFX8-NEXT: s_sext_i32_i16 s4, s8 3967; GFX8-NEXT: s_max_i32 s16, s4, s17 3968; GFX8-NEXT: s_addk_i32 s16, 0x8001 3969; GFX8-NEXT: s_min_i32 s4, s4, s17 3970; GFX8-NEXT: s_sext_i32_i16 s16, s16 3971; GFX8-NEXT: s_sext_i32_i16 s12, s12 3972; GFX8-NEXT: s_addk_i32 s4, 0x8000 3973; GFX8-NEXT: s_max_i32 s12, s16, s12 3974; GFX8-NEXT: s_sext_i32_i16 s12, s12 3975; GFX8-NEXT: s_sext_i32_i16 s4, s4 3976; GFX8-NEXT: s_min_i32 s4, s12, s4 3977; GFX8-NEXT: s_sub_i32 s4, s8, s4 3978; GFX8-NEXT: s_sext_i32_i16 s8, s1 3979; GFX8-NEXT: s_max_i32 s12, s8, s17 3980; GFX8-NEXT: s_addk_i32 s12, 0x8001 3981; GFX8-NEXT: s_lshr_b32 s13, s5, 16 3982; GFX8-NEXT: s_min_i32 s8, s8, s17 3983; GFX8-NEXT: s_sext_i32_i16 s12, s12 3984; GFX8-NEXT: s_sext_i32_i16 s5, s5 3985; GFX8-NEXT: s_addk_i32 s8, 0x8000 3986; GFX8-NEXT: s_max_i32 s5, s12, s5 3987; GFX8-NEXT: s_sext_i32_i16 s5, s5 3988; GFX8-NEXT: s_sext_i32_i16 s8, s8 3989; GFX8-NEXT: s_lshr_b32 s9, s1, 16 3990; GFX8-NEXT: s_min_i32 s5, s5, s8 3991; GFX8-NEXT: s_sub_i32 s1, s1, s5 3992; GFX8-NEXT: s_sext_i32_i16 s5, s9 3993; GFX8-NEXT: s_max_i32 s8, s5, s17 3994; GFX8-NEXT: s_addk_i32 s8, 0x8001 3995; GFX8-NEXT: s_min_i32 s5, s5, s17 3996; GFX8-NEXT: s_sext_i32_i16 s8, s8 3997; GFX8-NEXT: s_sext_i32_i16 s12, s13 3998; GFX8-NEXT: s_addk_i32 s5, 0x8000 3999; GFX8-NEXT: s_max_i32 s8, s8, s12 4000; GFX8-NEXT: s_sext_i32_i16 s8, s8 4001; GFX8-NEXT: s_sext_i32_i16 s5, s5 4002; GFX8-NEXT: s_min_i32 s5, s8, s5 4003; GFX8-NEXT: s_sext_i32_i16 s8, s2 4004; GFX8-NEXT: s_sub_i32 s5, s9, s5 4005; GFX8-NEXT: s_max_i32 s9, s8, s17 4006; GFX8-NEXT: s_addk_i32 s9, 0x8001 4007; GFX8-NEXT: s_lshr_b32 s14, s6, 16 4008; GFX8-NEXT: s_min_i32 s8, s8, s17 4009; GFX8-NEXT: s_sext_i32_i16 s9, s9 4010; GFX8-NEXT: s_sext_i32_i16 s6, s6 4011; GFX8-NEXT: s_addk_i32 s8, 0x8000 4012; GFX8-NEXT: s_max_i32 s6, s9, s6 4013; GFX8-NEXT: s_sext_i32_i16 s6, s6 4014; GFX8-NEXT: s_sext_i32_i16 s8, s8 4015; GFX8-NEXT: s_lshr_b32 s10, s2, 16 4016; GFX8-NEXT: s_min_i32 s6, s6, s8 4017; GFX8-NEXT: s_sub_i32 s2, s2, s6 4018; GFX8-NEXT: s_sext_i32_i16 s6, s10 4019; GFX8-NEXT: s_max_i32 s8, s6, s17 4020; GFX8-NEXT: s_addk_i32 s8, 0x8001 4021; GFX8-NEXT: s_min_i32 s6, s6, s17 4022; GFX8-NEXT: s_sext_i32_i16 s8, s8 4023; GFX8-NEXT: s_sext_i32_i16 s9, s14 4024; GFX8-NEXT: s_addk_i32 s6, 0x8000 4025; GFX8-NEXT: s_max_i32 s8, s8, s9 4026; GFX8-NEXT: s_sext_i32_i16 s8, s8 4027; GFX8-NEXT: s_sext_i32_i16 s6, s6 4028; GFX8-NEXT: s_min_i32 s6, s8, s6 4029; GFX8-NEXT: s_sext_i32_i16 s8, s3 4030; GFX8-NEXT: s_max_i32 s9, s8, s17 4031; GFX8-NEXT: s_addk_i32 s9, 0x8001 4032; GFX8-NEXT: s_lshr_b32 s15, s7, 16 4033; GFX8-NEXT: s_min_i32 s8, s8, s17 4034; GFX8-NEXT: s_sext_i32_i16 s9, s9 4035; GFX8-NEXT: s_sext_i32_i16 s7, s7 4036; GFX8-NEXT: s_addk_i32 s8, 0x8000 4037; GFX8-NEXT: s_max_i32 s7, s9, s7 4038; GFX8-NEXT: s_sext_i32_i16 s7, s7 4039; GFX8-NEXT: s_sext_i32_i16 s8, s8 4040; GFX8-NEXT: s_lshr_b32 s11, s3, 16 4041; GFX8-NEXT: s_min_i32 s7, s7, s8 4042; GFX8-NEXT: s_sub_i32 s3, s3, s7 4043; GFX8-NEXT: s_sext_i32_i16 s7, s11 4044; GFX8-NEXT: s_max_i32 s8, s7, s17 4045; GFX8-NEXT: s_addk_i32 s8, 0x8001 4046; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 4047; GFX8-NEXT: s_min_i32 s7, s7, s17 4048; GFX8-NEXT: s_sext_i32_i16 s8, s8 4049; GFX8-NEXT: s_sext_i32_i16 s9, s15 4050; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 4051; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4052; GFX8-NEXT: s_addk_i32 s7, 0x8000 4053; GFX8-NEXT: s_max_i32 s8, s8, s9 4054; GFX8-NEXT: s_or_b32 s0, s0, s4 4055; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 4056; GFX8-NEXT: s_sub_i32 s6, s10, s6 4057; GFX8-NEXT: s_sext_i32_i16 s8, s8 4058; GFX8-NEXT: s_sext_i32_i16 s7, s7 4059; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 4060; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4061; GFX8-NEXT: s_min_i32 s7, s8, s7 4062; GFX8-NEXT: s_or_b32 s1, s1, s4 4063; GFX8-NEXT: s_and_b32 s4, 0xffff, s6 4064; GFX8-NEXT: s_sub_i32 s7, s11, s7 4065; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 4066; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4067; GFX8-NEXT: s_or_b32 s2, s2, s4 4068; GFX8-NEXT: s_and_b32 s4, 0xffff, s7 4069; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 4070; GFX8-NEXT: s_lshl_b32 s4, s4, 16 4071; GFX8-NEXT: s_or_b32 s3, s3, s4 4072; GFX8-NEXT: ; return to shader part epilog 4073; 4074; GFX9-LABEL: s_ssubsat_v8i16: 4075; GFX9: ; %bb.0: 4076; GFX9-NEXT: v_mov_b32_e32 v0, s4 4077; GFX9-NEXT: v_mov_b32_e32 v1, s5 4078; GFX9-NEXT: v_mov_b32_e32 v2, s6 4079; GFX9-NEXT: v_mov_b32_e32 v3, s7 4080; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp 4081; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 clamp 4082; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 clamp 4083; GFX9-NEXT: v_pk_sub_i16 v3, s3, v3 clamp 4084; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4085; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4086; GFX9-NEXT: v_readfirstlane_b32 s2, v2 4087; GFX9-NEXT: v_readfirstlane_b32 s3, v3 4088; GFX9-NEXT: ; return to shader part epilog 4089; 4090; GFX10PLUS-LABEL: s_ssubsat_v8i16: 4091; GFX10PLUS: ; %bb.0: 4092; GFX10PLUS-NEXT: v_pk_sub_i16 v0, s0, s4 clamp 4093; GFX10PLUS-NEXT: v_pk_sub_i16 v1, s1, s5 clamp 4094; GFX10PLUS-NEXT: v_pk_sub_i16 v2, s2, s6 clamp 4095; GFX10PLUS-NEXT: v_pk_sub_i16 v3, s3, s7 clamp 4096; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 4097; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 4098; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 4099; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 4100; GFX10PLUS-NEXT: ; return to shader part epilog 4101 %result = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 4102 %cast = bitcast <8 x i16> %result to <4 x i32> 4103 ret <4 x i32> %cast 4104} 4105 4106define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { 4107; GFX6-LABEL: v_ssubsat_i48: 4108; GFX6: ; %bb.0: 4109; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4110; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 4111; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc 4112; GFX6-NEXT: v_bfe_i32 v5, v4, 0, 16 4113; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 4114; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4115; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4116; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] 4117; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4118; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffff8000, v0 4119; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v5 4120; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4121; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 4122; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 4123; GFX6-NEXT: s_setpc_b64 s[30:31] 4124; 4125; GFX8-LABEL: v_ssubsat_i48: 4126; GFX8: ; %bb.0: 4127; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4128; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 4129; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v1, v3, vcc 4130; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 16 4131; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 4132; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 4133; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4134; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] 4135; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4136; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffff8000, v0 4137; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v5 4138; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4139; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc 4140; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 4141; GFX8-NEXT: s_setpc_b64 s[30:31] 4142; 4143; GFX9-LABEL: v_ssubsat_i48: 4144; GFX9: ; %bb.0: 4145; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4146; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4147; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 4148; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 4149; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc 4150; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4151; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] 4152; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4153; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4154; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4155; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4156; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4157; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4158; GFX9-NEXT: s_setpc_b64 s[30:31] 4159; 4160; GFX10-LABEL: v_ssubsat_i48: 4161; GFX10: ; %bb.0: 4162; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4163; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4164; GFX10-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 4165; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 4166; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4167; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] 4168; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4169; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] 4170; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4171; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 4172; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo 4173; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 4174; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4175; GFX10-NEXT: s_setpc_b64 s[30:31] 4176; 4177; GFX11-LABEL: v_ssubsat_i48: 4178; GFX11: ; %bb.0: 4179; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4180; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4181; GFX11-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 4182; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 4183; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4184; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] 4185; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4186; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] 4187; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4188; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 4189; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 4190; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4191; GFX11-NEXT: s_setpc_b64 s[30:31] 4192 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) 4193 ret i48 %result 4194} 4195 4196define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { 4197; GFX6-LABEL: s_ssubsat_i48: 4198; GFX6: ; %bb.0: 4199; GFX6-NEXT: s_sub_u32 s4, s0, s2 4200; GFX6-NEXT: s_subb_u32 s3, s1, s3 4201; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4202; GFX6-NEXT: v_mov_b32_e32 v0, s0 4203; GFX6-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 4204; GFX6-NEXT: v_mov_b32_e32 v1, s1 4205; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 4206; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] 4207; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4208; GFX6-NEXT: s_ashr_i32 s2, s7, 31 4209; GFX6-NEXT: s_ashr_i32 s5, s7, 15 4210; GFX6-NEXT: s_addk_i32 s2, 0x8000 4211; GFX6-NEXT: v_mov_b32_e32 v0, s5 4212; GFX6-NEXT: v_mov_b32_e32 v1, s2 4213; GFX6-NEXT: v_mov_b32_e32 v2, s4 4214; GFX6-NEXT: v_mov_b32_e32 v3, s3 4215; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4216; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4217; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4218; GFX6-NEXT: v_readfirstlane_b32 s0, v0 4219; GFX6-NEXT: v_readfirstlane_b32 s1, v1 4220; GFX6-NEXT: ; return to shader part epilog 4221; 4222; GFX8-LABEL: s_ssubsat_i48: 4223; GFX8: ; %bb.0: 4224; GFX8-NEXT: s_sub_u32 s4, s0, s2 4225; GFX8-NEXT: s_subb_u32 s3, s1, s3 4226; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4227; GFX8-NEXT: v_mov_b32_e32 v0, s0 4228; GFX8-NEXT: s_bfe_i64 s[6:7], s[4:5], 0x300000 4229; GFX8-NEXT: v_mov_b32_e32 v1, s1 4230; GFX8-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x300000 4231; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[0:1] 4232; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4233; GFX8-NEXT: s_ashr_i32 s2, s7, 31 4234; GFX8-NEXT: s_ashr_i32 s5, s7, 15 4235; GFX8-NEXT: s_addk_i32 s2, 0x8000 4236; GFX8-NEXT: v_mov_b32_e32 v0, s5 4237; GFX8-NEXT: v_mov_b32_e32 v1, s2 4238; GFX8-NEXT: v_mov_b32_e32 v2, s4 4239; GFX8-NEXT: v_mov_b32_e32 v3, s3 4240; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4241; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4242; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4243; GFX8-NEXT: v_readfirstlane_b32 s0, v0 4244; GFX8-NEXT: v_readfirstlane_b32 s1, v1 4245; GFX8-NEXT: ; return to shader part epilog 4246; 4247; GFX9-LABEL: s_ssubsat_i48: 4248; GFX9: ; %bb.0: 4249; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4250; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 4251; GFX9-NEXT: s_sub_u32 s4, s0, s2 4252; GFX9-NEXT: v_mov_b32_e32 v0, s0 4253; GFX9-NEXT: s_subb_u32 s5, s1, s3 4254; GFX9-NEXT: v_mov_b32_e32 v1, s1 4255; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4256; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 4257; GFX9-NEXT: s_ashr_i32 s2, s5, 31 4258; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 4259; GFX9-NEXT: v_mov_b32_e32 v0, s2 4260; GFX9-NEXT: v_mov_b32_e32 v1, s3 4261; GFX9-NEXT: v_mov_b32_e32 v2, s4 4262; GFX9-NEXT: v_mov_b32_e32 v3, s5 4263; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4264; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4265; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4266; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4267; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4268; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4269; GFX9-NEXT: ; return to shader part epilog 4270; 4271; GFX10-LABEL: s_ssubsat_i48: 4272; GFX10: ; %bb.0: 4273; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4274; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 4275; GFX10-NEXT: s_sub_u32 s4, s0, s2 4276; GFX10-NEXT: s_subb_u32 s5, s1, s3 4277; GFX10-NEXT: v_mov_b32_e32 v0, s4 4278; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4279; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 4280; GFX10-NEXT: v_mov_b32_e32 v1, s5 4281; GFX10-NEXT: s_ashr_i32 s2, s5, 31 4282; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 4283; GFX10-NEXT: s_xor_b32 s0, s1, s0 4284; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4285; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4286; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4287; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4288; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4289; GFX10-NEXT: ; return to shader part epilog 4290; 4291; GFX11-LABEL: s_ssubsat_i48: 4292; GFX11: ; %bb.0: 4293; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4294; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 4295; GFX11-NEXT: s_sub_u32 s4, s0, s2 4296; GFX11-NEXT: s_subb_u32 s5, s1, s3 4297; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 4298; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4299; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 4300; GFX11-NEXT: s_ashr_i32 s2, s5, 31 4301; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 4302; GFX11-NEXT: s_xor_b32 s0, s1, s0 4303; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4304; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4305; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4306; GFX11-NEXT: v_readfirstlane_b32 s0, v0 4307; GFX11-NEXT: v_readfirstlane_b32 s1, v1 4308; GFX11-NEXT: ; return to shader part epilog 4309 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) 4310 ret i48 %result 4311} 4312 4313define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { 4314; GFX6-LABEL: ssubsat_i48_sv: 4315; GFX6: ; %bb.0: 4316; GFX6-NEXT: v_mov_b32_e32 v3, s1 4317; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 4318; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc 4319; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4320; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4321; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 4322; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4323; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] 4324; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4325; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4326; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 4327; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4328; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4329; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4330; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4331; GFX6-NEXT: ; return to shader part epilog 4332; 4333; GFX8-LABEL: ssubsat_i48_sv: 4334; GFX8: ; %bb.0: 4335; GFX8-NEXT: v_mov_b32_e32 v3, s1 4336; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 4337; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v3, v1, vcc 4338; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 4339; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4340; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 4341; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4342; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] 4343; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4344; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4345; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 4346; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4347; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4348; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4349; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 4350; GFX8-NEXT: ; return to shader part epilog 4351; 4352; GFX9-LABEL: ssubsat_i48_sv: 4353; GFX9: ; %bb.0: 4354; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4355; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4356; GFX9-NEXT: v_mov_b32_e32 v3, s1 4357; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 4358; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc 4359; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4360; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] 4361; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4362; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4363; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4364; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4365; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4366; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4367; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4368; GFX9-NEXT: ; return to shader part epilog 4369; 4370; GFX10-LABEL: ssubsat_i48_sv: 4371; GFX10: ; %bb.0: 4372; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4373; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4374; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 4375; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4376; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4377; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4378; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] 4379; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4380; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4381; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4382; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4383; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4384; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 4385; GFX10-NEXT: ; return to shader part epilog 4386; 4387; GFX11-LABEL: ssubsat_i48_sv: 4388; GFX11: ; %bb.0: 4389; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4390; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4391; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 4392; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4393; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4394; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4395; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] 4396; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4397; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4398; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4399; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4400; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 4401; GFX11-NEXT: ; return to shader part epilog 4402 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) 4403 %ext.result = zext i48 %result to i64 4404 %cast = bitcast i64 %ext.result to <2 x float> 4405 ret <2 x float> %cast 4406} 4407 4408define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { 4409; GFX6-LABEL: ssubsat_i48_vs: 4410; GFX6: ; %bb.0: 4411; GFX6-NEXT: v_mov_b32_e32 v3, s1 4412; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 4413; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc 4414; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4415; GFX6-NEXT: v_bfe_i32 v1, v0, 0, 16 4416; GFX6-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4417; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4418; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4419; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4420; GFX6-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4421; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffff8000, v0 4422; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4423; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4424; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4425; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4426; GFX6-NEXT: ; return to shader part epilog 4427; 4428; GFX8-LABEL: ssubsat_i48_vs: 4429; GFX8: ; %bb.0: 4430; GFX8-NEXT: v_mov_b32_e32 v3, s1 4431; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 4432; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v1, v3, vcc 4433; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 16 4434; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 16 4435; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x300000 4436; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4437; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4438; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4439; GFX8-NEXT: v_ashrrev_i32_e32 v1, 15, v3 4440; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffff8000, v0 4441; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4442; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 4443; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc 4444; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 4445; GFX8-NEXT: ; return to shader part epilog 4446; 4447; GFX9-LABEL: ssubsat_i48_vs: 4448; GFX9: ; %bb.0: 4449; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4450; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4451; GFX9-NEXT: v_mov_b32_e32 v3, s1 4452; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 4453; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 4454; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4455; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4456; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4457; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4458; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4459; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4460; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4461; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4462; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4463; GFX9-NEXT: ; return to shader part epilog 4464; 4465; GFX10-LABEL: ssubsat_i48_vs: 4466; GFX10: ; %bb.0: 4467; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4468; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4469; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 4470; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4471; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 4472; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4473; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4474; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4475; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4476; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4477; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4478; GFX10-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4479; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 4480; GFX10-NEXT: ; return to shader part epilog 4481; 4482; GFX11-LABEL: ssubsat_i48_vs: 4483; GFX11: ; %bb.0: 4484; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 4485; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 4486; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 4487; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4488; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 4489; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4490; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4491; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4492; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4493; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4494; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] 4495; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 4496; GFX11-NEXT: ; return to shader part epilog 4497 %result = call i48 @llvm.ssub.sat.i48(i48 %lhs, i48 %rhs) 4498 %ext.result = zext i48 %result to i64 4499 %cast = bitcast i64 %ext.result to <2 x float> 4500 ret <2 x float> %cast 4501} 4502 4503define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { 4504; GFX6-LABEL: v_ssubsat_i64: 4505; GFX6: ; %bb.0: 4506; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4507; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 4508; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc 4509; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4510; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] 4511; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4512; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 4513; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4514; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4515; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4516; GFX6-NEXT: s_setpc_b64 s[30:31] 4517; 4518; GFX8-LABEL: v_ssubsat_i64: 4519; GFX8: ; %bb.0: 4520; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4521; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 4522; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc 4523; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] 4524; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] 4525; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4526; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 4527; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4528; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4529; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4530; GFX8-NEXT: s_setpc_b64 s[30:31] 4531; 4532; GFX9-LABEL: v_ssubsat_i64: 4533; GFX9: ; %bb.0: 4534; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4535; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 4536; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc 4537; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] 4538; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] 4539; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 4540; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4541; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4542; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4543; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4544; GFX9-NEXT: s_setpc_b64 s[30:31] 4545; 4546; GFX10-LABEL: v_ssubsat_i64: 4547; GFX10: ; %bb.0: 4548; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4549; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 4550; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4551; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] 4552; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4553; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] 4554; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4555; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo 4556; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo 4557; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo 4558; GFX10-NEXT: s_setpc_b64 s[30:31] 4559; 4560; GFX11-LABEL: v_ssubsat_i64: 4561; GFX11: ; %bb.0: 4562; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4563; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 4564; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo 4565; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] 4566; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 4567; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] 4568; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 4569; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4570; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 4571; GFX11-NEXT: s_setpc_b64 s[30:31] 4572 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) 4573 ret i64 %result 4574} 4575 4576define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { 4577; GFX6-LABEL: s_ssubsat_i64: 4578; GFX6: ; %bb.0: 4579; GFX6-NEXT: s_sub_u32 s4, s0, s2 4580; GFX6-NEXT: v_mov_b32_e32 v0, s0 4581; GFX6-NEXT: s_subb_u32 s5, s1, s3 4582; GFX6-NEXT: v_mov_b32_e32 v1, s1 4583; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4584; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 4585; GFX6-NEXT: s_ashr_i32 s2, s5, 31 4586; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 4587; GFX6-NEXT: v_mov_b32_e32 v0, s2 4588; GFX6-NEXT: v_mov_b32_e32 v1, s3 4589; GFX6-NEXT: v_mov_b32_e32 v2, s4 4590; GFX6-NEXT: v_mov_b32_e32 v3, s5 4591; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4592; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4593; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4594; GFX6-NEXT: v_readfirstlane_b32 s0, v0 4595; GFX6-NEXT: v_readfirstlane_b32 s1, v1 4596; GFX6-NEXT: ; return to shader part epilog 4597; 4598; GFX8-LABEL: s_ssubsat_i64: 4599; GFX8: ; %bb.0: 4600; GFX8-NEXT: s_sub_u32 s4, s0, s2 4601; GFX8-NEXT: v_mov_b32_e32 v0, s0 4602; GFX8-NEXT: s_subb_u32 s5, s1, s3 4603; GFX8-NEXT: v_mov_b32_e32 v1, s1 4604; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4605; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 4606; GFX8-NEXT: s_ashr_i32 s2, s5, 31 4607; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 4608; GFX8-NEXT: v_mov_b32_e32 v0, s2 4609; GFX8-NEXT: v_mov_b32_e32 v1, s3 4610; GFX8-NEXT: v_mov_b32_e32 v2, s4 4611; GFX8-NEXT: v_mov_b32_e32 v3, s5 4612; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4613; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4614; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4615; GFX8-NEXT: v_readfirstlane_b32 s0, v0 4616; GFX8-NEXT: v_readfirstlane_b32 s1, v1 4617; GFX8-NEXT: ; return to shader part epilog 4618; 4619; GFX9-LABEL: s_ssubsat_i64: 4620; GFX9: ; %bb.0: 4621; GFX9-NEXT: s_sub_u32 s4, s0, s2 4622; GFX9-NEXT: v_mov_b32_e32 v0, s0 4623; GFX9-NEXT: s_subb_u32 s5, s1, s3 4624; GFX9-NEXT: v_mov_b32_e32 v1, s1 4625; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] 4626; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 4627; GFX9-NEXT: s_ashr_i32 s2, s5, 31 4628; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 4629; GFX9-NEXT: v_mov_b32_e32 v0, s2 4630; GFX9-NEXT: v_mov_b32_e32 v1, s3 4631; GFX9-NEXT: v_mov_b32_e32 v2, s4 4632; GFX9-NEXT: v_mov_b32_e32 v3, s5 4633; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4634; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4635; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4636; GFX9-NEXT: v_readfirstlane_b32 s0, v0 4637; GFX9-NEXT: v_readfirstlane_b32 s1, v1 4638; GFX9-NEXT: ; return to shader part epilog 4639; 4640; GFX10-LABEL: s_ssubsat_i64: 4641; GFX10: ; %bb.0: 4642; GFX10-NEXT: s_sub_u32 s4, s0, s2 4643; GFX10-NEXT: s_subb_u32 s5, s1, s3 4644; GFX10-NEXT: v_mov_b32_e32 v0, s4 4645; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4646; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 4647; GFX10-NEXT: v_mov_b32_e32 v1, s5 4648; GFX10-NEXT: s_ashr_i32 s2, s5, 31 4649; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 4650; GFX10-NEXT: s_xor_b32 s0, s1, s0 4651; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4652; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4653; GFX10-NEXT: v_readfirstlane_b32 s0, v0 4654; GFX10-NEXT: v_readfirstlane_b32 s1, v1 4655; GFX10-NEXT: ; return to shader part epilog 4656; 4657; GFX11-LABEL: s_ssubsat_i64: 4658; GFX11: ; %bb.0: 4659; GFX11-NEXT: s_sub_u32 s4, s0, s2 4660; GFX11-NEXT: s_subb_u32 s5, s1, s3 4661; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 4662; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] 4663; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 4664; GFX11-NEXT: s_ashr_i32 s2, s5, 31 4665; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 4666; GFX11-NEXT: s_xor_b32 s0, s1, s0 4667; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 4668; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 4669; GFX11-NEXT: v_readfirstlane_b32 s0, v0 4670; GFX11-NEXT: v_readfirstlane_b32 s1, v1 4671; GFX11-NEXT: ; return to shader part epilog 4672 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) 4673 ret i64 %result 4674} 4675 4676define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { 4677; GFX6-LABEL: ssubsat_i64_sv: 4678; GFX6: ; %bb.0: 4679; GFX6-NEXT: v_mov_b32_e32 v3, s1 4680; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v0 4681; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc 4682; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4683; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] 4684; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4685; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 4686; GFX6-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4687; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4688; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4689; GFX6-NEXT: ; return to shader part epilog 4690; 4691; GFX8-LABEL: ssubsat_i64_sv: 4692; GFX8: ; %bb.0: 4693; GFX8-NEXT: v_mov_b32_e32 v3, s1 4694; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0 4695; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc 4696; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] 4697; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] 4698; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4699; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 4700; GFX8-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] 4701; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4702; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4703; GFX8-NEXT: ; return to shader part epilog 4704; 4705; GFX9-LABEL: ssubsat_i64_sv: 4706; GFX9: ; %bb.0: 4707; GFX9-NEXT: v_mov_b32_e32 v3, s1 4708; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 4709; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc 4710; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] 4711; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] 4712; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4713; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4714; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4715; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4716; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4717; GFX9-NEXT: ; return to shader part epilog 4718; 4719; GFX10-LABEL: ssubsat_i64_sv: 4720; GFX10: ; %bb.0: 4721; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 4722; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4723; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4724; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4725; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] 4726; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4727; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4728; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4729; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4730; GFX10-NEXT: ; return to shader part epilog 4731; 4732; GFX11-LABEL: ssubsat_i64_sv: 4733; GFX11: ; %bb.0: 4734; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 4735; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4736; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4737; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] 4738; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] 4739; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4740; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4741; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4742; GFX11-NEXT: ; return to shader part epilog 4743 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) 4744 %cast = bitcast i64 %result to <2 x float> 4745 ret <2 x float> %cast 4746} 4747 4748define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { 4749; GFX6-LABEL: ssubsat_i64_vs: 4750; GFX6: ; %bb.0: 4751; GFX6-NEXT: v_mov_b32_e32 v3, s1 4752; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 4753; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 4754; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4755; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4756; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4757; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000000, v0 4758; GFX6-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4759; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4760; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4761; GFX6-NEXT: ; return to shader part epilog 4762; 4763; GFX8-LABEL: ssubsat_i64_vs: 4764; GFX8: ; %bb.0: 4765; GFX8-NEXT: v_mov_b32_e32 v3, s1 4766; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v0 4767; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 4768; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] 4769; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4770; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4771; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000000, v0 4772; GFX8-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] 4773; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4774; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4775; GFX8-NEXT: ; return to shader part epilog 4776; 4777; GFX9-LABEL: ssubsat_i64_vs: 4778; GFX9: ; %bb.0: 4779; GFX9-NEXT: v_mov_b32_e32 v3, s1 4780; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 4781; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 4782; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] 4783; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 4784; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 4785; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4786; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 4787; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 4788; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 4789; GFX9-NEXT: ; return to shader part epilog 4790; 4791; GFX10-LABEL: ssubsat_i64_vs: 4792; GFX10: ; %bb.0: 4793; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 4794; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4795; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 4796; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4797; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4798; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4799; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4800; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo 4801; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo 4802; GFX10-NEXT: ; return to shader part epilog 4803; 4804; GFX11-LABEL: ssubsat_i64_vs: 4805; GFX11: ; %bb.0: 4806; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 4807; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo 4808; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 4809; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 4810; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] 4811; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 4812; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4813; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 4814; GFX11-NEXT: ; return to shader part epilog 4815 %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) 4816 %cast = bitcast i64 %result to <2 x float> 4817 ret <2 x float> %cast 4818} 4819 4820define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { 4821; GFX6-LABEL: v_ssubsat_v2i64: 4822; GFX6: ; %bb.0: 4823; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4824; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 4825; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc 4826; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] 4827; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] 4828; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4829; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 4830; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 4831; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4832; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 4833; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 4834; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v6 4835; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc 4836; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] 4837; GFX6-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] 4838; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4839; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v2 4840; GFX6-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4841; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4842; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4843; GFX6-NEXT: s_setpc_b64 s[30:31] 4844; 4845; GFX8-LABEL: v_ssubsat_v2i64: 4846; GFX8: ; %bb.0: 4847; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4848; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v4 4849; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v1, v5, vcc 4850; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] 4851; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] 4852; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4853; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 4854; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 4855; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4856; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 4857; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 4858; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v6 4859; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v3, v7, vcc 4860; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] 4861; GFX8-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] 4862; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4863; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v2 4864; GFX8-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] 4865; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4866; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4867; GFX8-NEXT: s_setpc_b64 s[30:31] 4868; 4869; GFX9-LABEL: v_ssubsat_v2i64: 4870; GFX9: ; %bb.0: 4871; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4872; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 4873; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc 4874; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] 4875; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] 4876; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 4877; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 4878; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4879; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc 4880; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc 4881; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 4882; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc 4883; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] 4884; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] 4885; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 4886; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 4887; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc 4888; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 4889; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 4890; GFX9-NEXT: s_setpc_b64 s[30:31] 4891; 4892; GFX10-LABEL: v_ssubsat_v2i64: 4893; GFX10: ; %bb.0: 4894; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4895; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 4896; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo 4897; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 4898; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo 4899; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9 4900; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] 4901; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] 4902; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 4903; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] 4904; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] 4905; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 4906; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 4907; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo 4908; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo 4909; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo 4910; GFX10-NEXT: s_xor_b32 vcc_lo, s6, s5 4911; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo 4912; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo 4913; GFX10-NEXT: s_setpc_b64 s[30:31] 4914; 4915; GFX11-LABEL: v_ssubsat_v2i64: 4916; GFX11: ; %bb.0: 4917; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 4918; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v0, v4 4919; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v1, v5, vcc_lo 4920; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, v2, v6 4921; GFX11-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v3, v7, vcc_lo 4922; GFX11-NEXT: v_ashrrev_i32_e32 v12, 31, v9 4923; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] 4924; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[4:5] 4925; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 4926; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] 4927; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7] 4928; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 4929; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 4930; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo 4931; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 4932; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 4933; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 4934; GFX11-NEXT: s_setpc_b64 s[30:31] 4935 %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 4936 ret <2 x i64> %result 4937} 4938 4939define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { 4940; GFX6-LABEL: s_ssubsat_v2i64: 4941; GFX6: ; %bb.0: 4942; GFX6-NEXT: s_sub_u32 s8, s0, s4 4943; GFX6-NEXT: v_mov_b32_e32 v0, s0 4944; GFX6-NEXT: s_subb_u32 s9, s1, s5 4945; GFX6-NEXT: v_mov_b32_e32 v1, s1 4946; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4947; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 4948; GFX6-NEXT: s_ashr_i32 s4, s9, 31 4949; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 4950; GFX6-NEXT: v_mov_b32_e32 v0, s4 4951; GFX6-NEXT: v_mov_b32_e32 v1, s5 4952; GFX6-NEXT: v_mov_b32_e32 v2, s8 4953; GFX6-NEXT: v_mov_b32_e32 v3, s9 4954; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc 4955; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 4956; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 4957; GFX6-NEXT: s_sub_u32 s0, s2, s6 4958; GFX6-NEXT: v_mov_b32_e32 v0, s2 4959; GFX6-NEXT: s_subb_u32 s1, s3, s7 4960; GFX6-NEXT: v_mov_b32_e32 v1, s3 4961; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 4962; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 4963; GFX6-NEXT: s_ashr_i32 s4, s1, 31 4964; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 4965; GFX6-NEXT: v_mov_b32_e32 v0, s4 4966; GFX6-NEXT: v_mov_b32_e32 v1, s5 4967; GFX6-NEXT: v_mov_b32_e32 v4, s0 4968; GFX6-NEXT: v_mov_b32_e32 v5, s1 4969; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc 4970; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 4971; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 4972; GFX6-NEXT: v_readfirstlane_b32 s0, v2 4973; GFX6-NEXT: v_readfirstlane_b32 s1, v3 4974; GFX6-NEXT: v_readfirstlane_b32 s2, v0 4975; GFX6-NEXT: v_readfirstlane_b32 s3, v1 4976; GFX6-NEXT: ; return to shader part epilog 4977; 4978; GFX8-LABEL: s_ssubsat_v2i64: 4979; GFX8: ; %bb.0: 4980; GFX8-NEXT: s_sub_u32 s8, s0, s4 4981; GFX8-NEXT: v_mov_b32_e32 v0, s0 4982; GFX8-NEXT: s_subb_u32 s9, s1, s5 4983; GFX8-NEXT: v_mov_b32_e32 v1, s1 4984; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 4985; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 4986; GFX8-NEXT: s_ashr_i32 s4, s9, 31 4987; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 4988; GFX8-NEXT: v_mov_b32_e32 v0, s4 4989; GFX8-NEXT: v_mov_b32_e32 v1, s5 4990; GFX8-NEXT: v_mov_b32_e32 v2, s8 4991; GFX8-NEXT: v_mov_b32_e32 v3, s9 4992; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc 4993; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 4994; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 4995; GFX8-NEXT: s_sub_u32 s0, s2, s6 4996; GFX8-NEXT: v_mov_b32_e32 v0, s2 4997; GFX8-NEXT: s_subb_u32 s1, s3, s7 4998; GFX8-NEXT: v_mov_b32_e32 v1, s3 4999; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 5000; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 5001; GFX8-NEXT: s_ashr_i32 s4, s1, 31 5002; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 5003; GFX8-NEXT: v_mov_b32_e32 v0, s4 5004; GFX8-NEXT: v_mov_b32_e32 v1, s5 5005; GFX8-NEXT: v_mov_b32_e32 v4, s0 5006; GFX8-NEXT: v_mov_b32_e32 v5, s1 5007; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc 5008; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 5009; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 5010; GFX8-NEXT: v_readfirstlane_b32 s0, v2 5011; GFX8-NEXT: v_readfirstlane_b32 s1, v3 5012; GFX8-NEXT: v_readfirstlane_b32 s2, v0 5013; GFX8-NEXT: v_readfirstlane_b32 s3, v1 5014; GFX8-NEXT: ; return to shader part epilog 5015; 5016; GFX9-LABEL: s_ssubsat_v2i64: 5017; GFX9: ; %bb.0: 5018; GFX9-NEXT: s_sub_u32 s8, s0, s4 5019; GFX9-NEXT: v_mov_b32_e32 v0, s0 5020; GFX9-NEXT: s_subb_u32 s9, s1, s5 5021; GFX9-NEXT: v_mov_b32_e32 v1, s1 5022; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] 5023; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 5024; GFX9-NEXT: s_ashr_i32 s4, s9, 31 5025; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 5026; GFX9-NEXT: v_mov_b32_e32 v0, s4 5027; GFX9-NEXT: v_mov_b32_e32 v1, s5 5028; GFX9-NEXT: v_mov_b32_e32 v2, s8 5029; GFX9-NEXT: v_mov_b32_e32 v3, s9 5030; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc 5031; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc 5032; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 5033; GFX9-NEXT: s_sub_u32 s0, s2, s6 5034; GFX9-NEXT: v_mov_b32_e32 v0, s2 5035; GFX9-NEXT: s_subb_u32 s1, s3, s7 5036; GFX9-NEXT: v_mov_b32_e32 v1, s3 5037; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] 5038; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 5039; GFX9-NEXT: s_ashr_i32 s4, s1, 31 5040; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 5041; GFX9-NEXT: v_mov_b32_e32 v0, s4 5042; GFX9-NEXT: v_mov_b32_e32 v1, s5 5043; GFX9-NEXT: v_mov_b32_e32 v4, s0 5044; GFX9-NEXT: v_mov_b32_e32 v5, s1 5045; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc 5046; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 5047; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc 5048; GFX9-NEXT: v_readfirstlane_b32 s0, v2 5049; GFX9-NEXT: v_readfirstlane_b32 s1, v3 5050; GFX9-NEXT: v_readfirstlane_b32 s2, v0 5051; GFX9-NEXT: v_readfirstlane_b32 s3, v1 5052; GFX9-NEXT: ; return to shader part epilog 5053; 5054; GFX10-LABEL: s_ssubsat_v2i64: 5055; GFX10: ; %bb.0: 5056; GFX10-NEXT: s_sub_u32 s8, s0, s4 5057; GFX10-NEXT: s_subb_u32 s9, s1, s5 5058; GFX10-NEXT: v_mov_b32_e32 v0, s8 5059; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] 5060; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 5061; GFX10-NEXT: s_ashr_i32 s4, s9, 31 5062; GFX10-NEXT: v_mov_b32_e32 v1, s9 5063; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 5064; GFX10-NEXT: s_xor_b32 s8, s1, s0 5065; GFX10-NEXT: s_sub_u32 s0, s2, s6 5066; GFX10-NEXT: s_subb_u32 s1, s3, s7 5067; GFX10-NEXT: v_mov_b32_e32 v2, s0 5068; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3] 5069; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 5070; GFX10-NEXT: v_mov_b32_e32 v3, s1 5071; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 5072; GFX10-NEXT: s_ashr_i32 s4, s1, 31 5073; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 5074; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 5075; GFX10-NEXT: s_xor_b32 s1, s3, s2 5076; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 5077; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 5078; GFX10-NEXT: v_readfirstlane_b32 s0, v0 5079; GFX10-NEXT: v_readfirstlane_b32 s1, v1 5080; GFX10-NEXT: v_readfirstlane_b32 s2, v2 5081; GFX10-NEXT: v_readfirstlane_b32 s3, v3 5082; GFX10-NEXT: ; return to shader part epilog 5083; 5084; GFX11-LABEL: s_ssubsat_v2i64: 5085; GFX11: ; %bb.0: 5086; GFX11-NEXT: s_sub_u32 s8, s0, s4 5087; GFX11-NEXT: s_subb_u32 s9, s1, s5 5088; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 5089; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] 5090; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 5091; GFX11-NEXT: s_ashr_i32 s4, s9, 31 5092; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 5093; GFX11-NEXT: s_xor_b32 s8, s1, s0 5094; GFX11-NEXT: s_sub_u32 s0, s2, s6 5095; GFX11-NEXT: s_subb_u32 s1, s3, s7 5096; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 5097; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[2:3] 5098; GFX11-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 5099; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 5100; GFX11-NEXT: s_ashr_i32 s4, s1, 31 5101; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 5102; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 5103; GFX11-NEXT: s_xor_b32 s1, s3, s2 5104; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 5105; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 5106; GFX11-NEXT: v_readfirstlane_b32 s0, v0 5107; GFX11-NEXT: v_readfirstlane_b32 s1, v1 5108; GFX11-NEXT: v_readfirstlane_b32 s2, v2 5109; GFX11-NEXT: v_readfirstlane_b32 s3, v3 5110; GFX11-NEXT: ; return to shader part epilog 5111 %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 5112 ret <2 x i64> %result 5113} 5114 5115define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { 5116; GFX6-LABEL: s_ssubsat_i128: 5117; GFX6: ; %bb.0: 5118; GFX6-NEXT: s_sub_u32 s8, s0, s4 5119; GFX6-NEXT: v_mov_b32_e32 v0, s0 5120; GFX6-NEXT: s_subb_u32 s9, s1, s5 5121; GFX6-NEXT: v_mov_b32_e32 v1, s1 5122; GFX6-NEXT: s_subb_u32 s10, s2, s6 5123; GFX6-NEXT: v_mov_b32_e32 v2, s2 5124; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 5125; GFX6-NEXT: s_subb_u32 s11, s3, s7 5126; GFX6-NEXT: v_mov_b32_e32 v3, s3 5127; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5128; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] 5129; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 5130; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5131; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[10:11], v[2:3] 5132; GFX6-NEXT: v_mov_b32_e32 v3, s9 5133; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5134; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5135; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 5136; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0 5137; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5138; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5139; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5140; GFX6-NEXT: s_ashr_i32 s0, s11, 31 5141; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5142; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 5143; GFX6-NEXT: v_mov_b32_e32 v1, s0 5144; GFX6-NEXT: v_mov_b32_e32 v2, s8 5145; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5146; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5147; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5148; GFX6-NEXT: v_mov_b32_e32 v3, s1 5149; GFX6-NEXT: v_mov_b32_e32 v4, s10 5150; GFX6-NEXT: v_mov_b32_e32 v5, s11 5151; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 5152; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5153; GFX6-NEXT: v_readfirstlane_b32 s0, v0 5154; GFX6-NEXT: v_readfirstlane_b32 s1, v2 5155; GFX6-NEXT: v_readfirstlane_b32 s2, v1 5156; GFX6-NEXT: v_readfirstlane_b32 s3, v3 5157; GFX6-NEXT: ; return to shader part epilog 5158; 5159; GFX8-LABEL: s_ssubsat_i128: 5160; GFX8: ; %bb.0: 5161; GFX8-NEXT: s_sub_u32 s8, s0, s4 5162; GFX8-NEXT: s_subb_u32 s9, s1, s5 5163; GFX8-NEXT: v_mov_b32_e32 v0, s0 5164; GFX8-NEXT: s_subb_u32 s10, s2, s6 5165; GFX8-NEXT: v_mov_b32_e32 v1, s1 5166; GFX8-NEXT: s_subb_u32 s11, s3, s7 5167; GFX8-NEXT: v_mov_b32_e32 v2, s2 5168; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 5169; GFX8-NEXT: v_mov_b32_e32 v3, s3 5170; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] 5171; GFX8-NEXT: s_cselect_b32 s0, 1, 0 5172; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5173; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] 5174; GFX8-NEXT: s_and_b32 s0, 1, s0 5175; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5176; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5177; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 5178; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5179; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 5180; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5181; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 5182; GFX8-NEXT: s_cselect_b32 s2, 1, 0 5183; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5184; GFX8-NEXT: s_and_b32 s0, 1, s2 5185; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5186; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5187; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5188; GFX8-NEXT: s_ashr_i32 s0, s11, 31 5189; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5190; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 5191; GFX8-NEXT: v_mov_b32_e32 v1, s0 5192; GFX8-NEXT: v_mov_b32_e32 v2, s8 5193; GFX8-NEXT: v_mov_b32_e32 v3, s9 5194; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5195; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5196; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5197; GFX8-NEXT: v_mov_b32_e32 v3, s1 5198; GFX8-NEXT: v_mov_b32_e32 v4, s10 5199; GFX8-NEXT: v_mov_b32_e32 v5, s11 5200; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 5201; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5202; GFX8-NEXT: v_readfirstlane_b32 s0, v0 5203; GFX8-NEXT: v_readfirstlane_b32 s1, v2 5204; GFX8-NEXT: v_readfirstlane_b32 s2, v1 5205; GFX8-NEXT: v_readfirstlane_b32 s3, v3 5206; GFX8-NEXT: ; return to shader part epilog 5207; 5208; GFX9-LABEL: s_ssubsat_i128: 5209; GFX9: ; %bb.0: 5210; GFX9-NEXT: s_sub_u32 s8, s0, s4 5211; GFX9-NEXT: s_subb_u32 s9, s1, s5 5212; GFX9-NEXT: v_mov_b32_e32 v0, s0 5213; GFX9-NEXT: s_subb_u32 s10, s2, s6 5214; GFX9-NEXT: v_mov_b32_e32 v1, s1 5215; GFX9-NEXT: s_subb_u32 s11, s3, s7 5216; GFX9-NEXT: v_mov_b32_e32 v2, s2 5217; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 5218; GFX9-NEXT: v_mov_b32_e32 v3, s3 5219; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] 5220; GFX9-NEXT: s_cselect_b32 s0, 1, 0 5221; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5222; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[2:3] 5223; GFX9-NEXT: s_and_b32 s0, 1, s0 5224; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5225; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5226; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[4:5], 0 5227; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5228; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 5229; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5230; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0 5231; GFX9-NEXT: s_cselect_b32 s2, 1, 0 5232; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5233; GFX9-NEXT: s_and_b32 s0, 1, s2 5234; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5235; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5236; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5237; GFX9-NEXT: s_ashr_i32 s0, s11, 31 5238; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5239; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 5240; GFX9-NEXT: v_mov_b32_e32 v1, s0 5241; GFX9-NEXT: v_mov_b32_e32 v2, s8 5242; GFX9-NEXT: v_mov_b32_e32 v3, s9 5243; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5244; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 5245; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 5246; GFX9-NEXT: v_mov_b32_e32 v3, s1 5247; GFX9-NEXT: v_mov_b32_e32 v4, s10 5248; GFX9-NEXT: v_mov_b32_e32 v5, s11 5249; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 5250; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 5251; GFX9-NEXT: v_readfirstlane_b32 s0, v0 5252; GFX9-NEXT: v_readfirstlane_b32 s1, v2 5253; GFX9-NEXT: v_readfirstlane_b32 s2, v1 5254; GFX9-NEXT: v_readfirstlane_b32 s3, v3 5255; GFX9-NEXT: ; return to shader part epilog 5256; 5257; GFX10-LABEL: s_ssubsat_i128: 5258; GFX10: ; %bb.0: 5259; GFX10-NEXT: s_sub_u32 s8, s0, s4 5260; GFX10-NEXT: s_subb_u32 s9, s1, s5 5261; GFX10-NEXT: s_subb_u32 s10, s2, s6 5262; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] 5263; GFX10-NEXT: s_subb_u32 s11, s3, s7 5264; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] 5265; GFX10-NEXT: s_cselect_b32 s12, 1, 0 5266; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 5267; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] 5268; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 5269; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 5270; GFX10-NEXT: s_and_b32 s0, 1, s12 5271; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 5272; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 5273; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 5274; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5275; GFX10-NEXT: s_cselect_b32 s1, 1, 0 5276; GFX10-NEXT: s_ashr_i32 s0, s11, 31 5277; GFX10-NEXT: s_and_b32 s1, 1, s1 5278; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 5279; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5280; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 5281; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 5282; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 5283; GFX10-NEXT: v_mov_b32_e32 v2, s9 5284; GFX10-NEXT: v_mov_b32_e32 v3, s11 5285; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5286; GFX10-NEXT: v_mov_b32_e32 v1, s8 5287; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5288; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5289; GFX10-NEXT: v_mov_b32_e32 v0, s10 5290; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo 5291; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo 5292; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo 5293; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo 5294; GFX10-NEXT: v_readfirstlane_b32 s0, v1 5295; GFX10-NEXT: v_readfirstlane_b32 s1, v2 5296; GFX10-NEXT: v_readfirstlane_b32 s2, v0 5297; GFX10-NEXT: v_readfirstlane_b32 s3, v3 5298; GFX10-NEXT: ; return to shader part epilog 5299; 5300; GFX11-LABEL: s_ssubsat_i128: 5301; GFX11: ; %bb.0: 5302; GFX11-NEXT: s_sub_u32 s8, s0, s4 5303; GFX11-NEXT: s_subb_u32 s9, s1, s5 5304; GFX11-NEXT: s_subb_u32 s10, s2, s6 5305; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] 5306; GFX11-NEXT: s_subb_u32 s11, s3, s7 5307; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] 5308; GFX11-NEXT: s_cselect_b32 s12, 1, 0 5309; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 5310; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] 5311; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 5312; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 5313; GFX11-NEXT: s_and_b32 s0, 1, s12 5314; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 5315; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 5316; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 5317; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5318; GFX11-NEXT: s_cselect_b32 s1, 1, 0 5319; GFX11-NEXT: s_ashr_i32 s0, s11, 31 5320; GFX11-NEXT: s_and_b32 s1, 1, s1 5321; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 5322; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5323; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 5324; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 5325; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 5326; GFX11-NEXT: v_mov_b32_e32 v3, s11 5327; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 5328; GFX11-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0 5329; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5330; GFX11-NEXT: v_mov_b32_e32 v0, s10 5331; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo 5332; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo 5333; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo 5334; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo 5335; GFX11-NEXT: v_readfirstlane_b32 s0, v1 5336; GFX11-NEXT: v_readfirstlane_b32 s1, v2 5337; GFX11-NEXT: v_readfirstlane_b32 s2, v0 5338; GFX11-NEXT: v_readfirstlane_b32 s3, v3 5339; GFX11-NEXT: ; return to shader part epilog 5340 %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) 5341 ret i128 %result 5342} 5343 5344define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { 5345; GFX6-LABEL: ssubsat_i128_sv: 5346; GFX6: ; %bb.0: 5347; GFX6-NEXT: v_mov_b32_e32 v5, s1 5348; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s0, v0 5349; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc 5350; GFX6-NEXT: v_mov_b32_e32 v6, s2 5351; GFX6-NEXT: v_mov_b32_e32 v7, s3 5352; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc 5353; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc 5354; GFX6-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 5355; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc 5356; GFX6-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] 5357; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc 5358; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] 5359; GFX6-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc 5360; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] 5361; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5362; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] 5363; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5364; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5365; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5366; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5367; GFX6-NEXT: v_xor_b32_e32 v0, v0, v8 5368; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 5369; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 5370; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5371; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5372; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5373; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5374; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5375; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5376; GFX6-NEXT: ; return to shader part epilog 5377; 5378; GFX8-LABEL: ssubsat_i128_sv: 5379; GFX8: ; %bb.0: 5380; GFX8-NEXT: v_mov_b32_e32 v5, s1 5381; GFX8-NEXT: v_sub_u32_e32 v4, vcc, s0, v0 5382; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc 5383; GFX8-NEXT: v_mov_b32_e32 v6, s2 5384; GFX8-NEXT: v_mov_b32_e32 v7, s3 5385; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc 5386; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc 5387; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 5388; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc 5389; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] 5390; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc 5391; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] 5392; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc 5393; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] 5394; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5395; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] 5396; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5397; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5398; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5399; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5400; GFX8-NEXT: v_xor_b32_e32 v0, v0, v8 5401; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 5402; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 5403; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5404; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5405; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5406; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5407; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5408; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5409; GFX8-NEXT: ; return to shader part epilog 5410; 5411; GFX9-LABEL: ssubsat_i128_sv: 5412; GFX9: ; %bb.0: 5413; GFX9-NEXT: v_mov_b32_e32 v5, s1 5414; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s0, v0 5415; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc 5416; GFX9-NEXT: v_mov_b32_e32 v6, s2 5417; GFX9-NEXT: v_mov_b32_e32 v7, s3 5418; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v2, vcc 5419; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc 5420; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 5421; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc 5422; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[6:7] 5423; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc 5424; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[6:7] 5425; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc 5426; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] 5427; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5428; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] 5429; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5430; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] 5431; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5432; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5433; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 5434; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5435; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 5436; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5437; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5438; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5439; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5440; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5441; GFX9-NEXT: ; return to shader part epilog 5442; 5443; GFX10-LABEL: ssubsat_i128_sv: 5444; GFX10: ; %bb.0: 5445; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 5446; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5447; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5448; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5449; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] 5450; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo 5451; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] 5452; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo 5453; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] 5454; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5455; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] 5456; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5457; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] 5458; GFX10-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo 5459; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] 5460; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5461; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5462; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 5463; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 5464; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5465; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5466; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo 5467; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 5468; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 5469; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo 5470; GFX10-NEXT: ; return to shader part epilog 5471; 5472; GFX11-LABEL: ssubsat_i128_sv: 5473; GFX11: ; %bb.0: 5474; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0 5475; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5476; GFX11-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5477; GFX11-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5478; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5] 5479; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo 5480; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7] 5481; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo 5482; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1] 5483; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5484; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] 5485; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5486; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7] 5487; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo 5488; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] 5489; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5490; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 5491; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 5492; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 5493; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5494; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 5495; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo 5496; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 5497; GFX11-NEXT: ; return to shader part epilog 5498 %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) 5499 %cast = bitcast i128 %result to <4 x float> 5500 ret <4 x float> %cast 5501} 5502 5503define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { 5504; GFX6-LABEL: ssubsat_i128_vs: 5505; GFX6: ; %bb.0: 5506; GFX6-NEXT: v_mov_b32_e32 v5, s1 5507; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v0 5508; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc 5509; GFX6-NEXT: v_mov_b32_e32 v6, s2 5510; GFX6-NEXT: v_mov_b32_e32 v7, s3 5511; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc 5512; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc 5513; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5514; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 5515; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5516; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5517; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5518; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5519; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5520; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5521; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 5522; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[2:3], 0 5523; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5524; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5525; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5526; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5527; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 5528; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 5529; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5530; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5531; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5532; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5533; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5534; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5535; GFX6-NEXT: ; return to shader part epilog 5536; 5537; GFX8-LABEL: ssubsat_i128_vs: 5538; GFX8: ; %bb.0: 5539; GFX8-NEXT: v_mov_b32_e32 v5, s1 5540; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s0, v0 5541; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v5, vcc 5542; GFX8-NEXT: v_mov_b32_e32 v6, s2 5543; GFX8-NEXT: v_mov_b32_e32 v7, s3 5544; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v6, vcc 5545; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v3, v7, vcc 5546; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5547; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 5548; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5549; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5550; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 5551; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5552; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5553; GFX8-NEXT: s_cselect_b32 s4, 1, 0 5554; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5555; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5556; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 5557; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5558; GFX8-NEXT: s_and_b32 s0, 1, s4 5559; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5560; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5561; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5562; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5563; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 5564; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 5565; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5566; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5567; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5568; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5569; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5570; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5571; GFX8-NEXT: ; return to shader part epilog 5572; 5573; GFX9-LABEL: ssubsat_i128_vs: 5574; GFX9: ; %bb.0: 5575; GFX9-NEXT: v_mov_b32_e32 v5, s1 5576; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s0, v0 5577; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v5, vcc 5578; GFX9-NEXT: v_mov_b32_e32 v6, s2 5579; GFX9-NEXT: v_mov_b32_e32 v7, s3 5580; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v6, vcc 5581; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v7, vcc 5582; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] 5583; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], 0 5584; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5585; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[2:3] 5586; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 5587; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5588; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] 5589; GFX9-NEXT: s_cselect_b32 s4, 1, 0 5590; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5591; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5592; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 5593; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5594; GFX9-NEXT: s_and_b32 s0, 1, s4 5595; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 5596; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5597; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5598; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5599; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5600; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 5601; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5602; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc 5603; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc 5604; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 5605; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 5606; GFX9-NEXT: ; return to shader part epilog 5607; 5608; GFX10-LABEL: ssubsat_i128_vs: 5609; GFX10: ; %bb.0: 5610; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 5611; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5612; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5613; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5614; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] 5615; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 5616; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0 5617; GFX10-NEXT: s_cselect_b32 s4, 1, 0 5618; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5619; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] 5620; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 5621; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 5622; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5623; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 5624; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 5625; GFX10-NEXT: s_and_b32 s0, 1, s4 5626; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5627; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5628; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5629; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 5630; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo 5631; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5632; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5633; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5634; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo 5635; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 5636; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo 5637; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo 5638; GFX10-NEXT: ; return to shader part epilog 5639; 5640; GFX11-LABEL: ssubsat_i128_vs: 5641; GFX11: ; %bb.0: 5642; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, s0 5643; GFX11-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo 5644; GFX11-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo 5645; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo 5646; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1] 5647; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0 5648; GFX11-NEXT: s_cmp_eq_u64 s[2:3], 0 5649; GFX11-NEXT: s_cselect_b32 s4, 1, 0 5650; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5651; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3] 5652; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 5653; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 5654; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5655; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] 5656; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 5657; GFX11-NEXT: s_and_b32 s0, 1, s4 5658; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 5659; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5660; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 5661; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 5662; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo 5663; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 5664; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 5665; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5666; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo 5667; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo 5668; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 5669; GFX11-NEXT: ; return to shader part epilog 5670 %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) 5671 %cast = bitcast i128 %result to <4 x float> 5672 ret <4 x float> %cast 5673} 5674 5675define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { 5676; GFX6-LABEL: v_ssubsat_v2i128: 5677; GFX6: ; %bb.0: 5678; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5679; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v0, v8 5680; GFX6-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc 5681; GFX6-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc 5682; GFX6-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc 5683; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] 5684; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5685; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] 5686; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5687; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] 5688; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5689; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] 5690; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5691; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] 5692; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5693; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5694; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5695; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5696; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v19 5697; GFX6-NEXT: v_bfrev_b32_e32 v1, 1 5698; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v1 5699; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5700; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5701; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc 5702; GFX6-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc 5703; GFX6-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc 5704; GFX6-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc 5705; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v4, v12 5706; GFX6-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc 5707; GFX6-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc 5708; GFX6-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc 5709; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5710; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5711; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5712; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5713; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5714; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5715; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] 5716; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5717; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] 5718; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5719; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5720; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 5721; GFX6-NEXT: v_xor_b32_e32 v4, v5, v4 5722; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v11 5723; GFX6-NEXT: v_add_i32_e32 v7, vcc, 0x80000000, v6 5724; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 5725; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5726; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5727; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 5728; GFX6-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 5729; GFX6-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc 5730; GFX6-NEXT: s_setpc_b64 s[30:31] 5731; 5732; GFX8-LABEL: v_ssubsat_v2i128: 5733; GFX8: ; %bb.0: 5734; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5735; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v0, v8 5736; GFX8-NEXT: v_subb_u32_e32 v17, vcc, v1, v9, vcc 5737; GFX8-NEXT: v_subb_u32_e32 v18, vcc, v2, v10, vcc 5738; GFX8-NEXT: v_subb_u32_e32 v19, vcc, v3, v11, vcc 5739; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] 5740; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5741; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] 5742; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5743; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] 5744; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5745; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] 5746; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5747; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] 5748; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5749; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5750; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5751; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 5752; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v19 5753; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 5754; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 5755; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 5756; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5757; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc 5758; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc 5759; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc 5760; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc 5761; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v4, v12 5762; GFX8-NEXT: v_subb_u32_e32 v9, vcc, v5, v13, vcc 5763; GFX8-NEXT: v_subb_u32_e32 v10, vcc, v6, v14, vcc 5764; GFX8-NEXT: v_subb_u32_e32 v11, vcc, v7, v15, vcc 5765; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5766; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5767; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5768; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5769; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5770; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5771; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] 5772; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5773; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] 5774; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5775; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5776; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 5777; GFX8-NEXT: v_xor_b32_e32 v4, v5, v4 5778; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v11 5779; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x80000000, v6 5780; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 5781; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5782; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5783; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 5784; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 5785; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc 5786; GFX8-NEXT: s_setpc_b64 s[30:31] 5787; 5788; GFX9-LABEL: v_ssubsat_v2i128: 5789; GFX9: ; %bb.0: 5790; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5791; GFX9-NEXT: v_sub_co_u32_e32 v16, vcc, v0, v8 5792; GFX9-NEXT: v_subb_co_u32_e32 v17, vcc, v1, v9, vcc 5793; GFX9-NEXT: v_subb_co_u32_e32 v18, vcc, v2, v10, vcc 5794; GFX9-NEXT: v_subb_co_u32_e32 v19, vcc, v3, v11, vcc 5795; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[0:1] 5796; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5797; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[18:19], v[2:3] 5798; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5799; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[18:19], v[2:3] 5800; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5801; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[8:9] 5802; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5803; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[10:11] 5804; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 5805; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] 5806; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5807; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 5808; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 5809; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 5810; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 5811; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5812; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc 5813; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc 5814; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc 5815; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc 5816; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v4, v12 5817; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v5, v13, vcc 5818; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v14, vcc 5819; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v7, v15, vcc 5820; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[4:5] 5821; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc 5822; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[6:7] 5823; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5824; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[10:11], v[6:7] 5825; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 5826; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[12:13] 5827; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 5828; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[14:15] 5829; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc 5830; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] 5831; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 5832; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 5833; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 5834; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 5835; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 5836; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 5837; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc 5838; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 5839; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc 5840; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc 5841; GFX9-NEXT: s_setpc_b64 s[30:31] 5842; 5843; GFX10-LABEL: v_ssubsat_v2i128: 5844; GFX10: ; %bb.0: 5845; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5846; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 5847; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo 5848; GFX10-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo 5849; GFX10-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo 5850; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] 5851; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5852; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] 5853; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5854; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] 5855; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5856; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] 5857; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5858; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] 5859; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 5860; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 5861; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo 5862; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo 5863; GFX10-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo 5864; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] 5865; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 5866; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] 5867; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 5868; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 5869; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] 5870; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 5871; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 5872; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] 5873; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo 5874; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] 5875; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo 5876; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] 5877; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 5878; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 5879; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] 5880; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 5881; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo 5882; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5883; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 5884; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19 5885; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 5886; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 5887; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo 5888; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo 5889; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo 5890; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 5891; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo 5892; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 5893; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 5894; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 5895; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, v7, s4 5896; GFX10-NEXT: s_setpc_b64 s[30:31] 5897; 5898; GFX11-LABEL: v_ssubsat_v2i128: 5899; GFX11: ; %bb.0: 5900; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 5901; GFX11-NEXT: v_sub_co_u32 v16, vcc_lo, v0, v8 5902; GFX11-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo 5903; GFX11-NEXT: v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo 5904; GFX11-NEXT: v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo 5905; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1] 5906; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 5907; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3] 5908; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5909; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3] 5910; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 5911; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9] 5912; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo 5913; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11] 5914; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 5915; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, v4, v12 5916; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo 5917; GFX11-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo 5918; GFX11-NEXT: v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo 5919; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11] 5920; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 5921; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5] 5922; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 5923; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo 5924; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7] 5925; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 5926; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13] 5927; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo 5928; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15] 5929; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo 5930; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7] 5931; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 5932; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 5933; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] 5934; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 5935; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 5936; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 5937; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 5938; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 5939; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 5940; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1 5941; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 5942; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo 5943; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 5944; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 5945; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 5946; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 5947; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, v7, s0 5948; GFX11-NEXT: s_setpc_b64 s[30:31] 5949 %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) 5950 ret <2 x i128> %result 5951} 5952 5953define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) { 5954; GFX6-LABEL: s_ssubsat_v2i128: 5955; GFX6: ; %bb.0: 5956; GFX6-NEXT: s_sub_u32 s16, s0, s8 5957; GFX6-NEXT: v_mov_b32_e32 v0, s0 5958; GFX6-NEXT: s_subb_u32 s17, s1, s9 5959; GFX6-NEXT: v_mov_b32_e32 v1, s1 5960; GFX6-NEXT: s_subb_u32 s18, s2, s10 5961; GFX6-NEXT: v_mov_b32_e32 v2, s2 5962; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] 5963; GFX6-NEXT: s_subb_u32 s19, s3, s11 5964; GFX6-NEXT: v_mov_b32_e32 v3, s3 5965; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 5966; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] 5967; GFX6-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 5968; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 5969; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[18:19], v[2:3] 5970; GFX6-NEXT: v_mov_b32_e32 v3, s17 5971; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 5972; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 5973; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 5974; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0 5975; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 5976; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 5977; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 5978; GFX6-NEXT: s_ashr_i32 s0, s19, 31 5979; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 5980; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 5981; GFX6-NEXT: v_mov_b32_e32 v1, s0 5982; GFX6-NEXT: v_mov_b32_e32 v2, s16 5983; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 5984; GFX6-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc 5985; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 5986; GFX6-NEXT: v_mov_b32_e32 v0, s1 5987; GFX6-NEXT: v_mov_b32_e32 v2, s18 5988; GFX6-NEXT: v_mov_b32_e32 v3, s19 5989; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc 5990; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc 5991; GFX6-NEXT: s_sub_u32 s0, s4, s12 5992; GFX6-NEXT: v_mov_b32_e32 v0, s4 5993; GFX6-NEXT: s_subb_u32 s1, s5, s13 5994; GFX6-NEXT: v_mov_b32_e32 v1, s5 5995; GFX6-NEXT: s_subb_u32 s2, s6, s14 5996; GFX6-NEXT: v_mov_b32_e32 v2, s6 5997; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 5998; GFX6-NEXT: s_subb_u32 s3, s7, s15 5999; GFX6-NEXT: v_mov_b32_e32 v3, s7 6000; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6001; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 6002; GFX6-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 6003; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6004; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[2:3] 6005; GFX6-NEXT: v_mov_b32_e32 v3, s1 6006; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6007; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 6008; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 6009; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0 6010; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] 6011; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 6012; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 6013; GFX6-NEXT: s_ashr_i32 s4, s3, 31 6014; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 6015; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 6016; GFX6-NEXT: v_mov_b32_e32 v1, s4 6017; GFX6-NEXT: v_mov_b32_e32 v2, s0 6018; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6019; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 6020; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 6021; GFX6-NEXT: v_mov_b32_e32 v3, s5 6022; GFX6-NEXT: v_mov_b32_e32 v8, s2 6023; GFX6-NEXT: v_mov_b32_e32 v9, s3 6024; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 6025; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 6026; GFX6-NEXT: v_readfirstlane_b32 s0, v4 6027; GFX6-NEXT: v_readfirstlane_b32 s1, v5 6028; GFX6-NEXT: v_readfirstlane_b32 s2, v6 6029; GFX6-NEXT: v_readfirstlane_b32 s3, v7 6030; GFX6-NEXT: v_readfirstlane_b32 s4, v0 6031; GFX6-NEXT: v_readfirstlane_b32 s5, v2 6032; GFX6-NEXT: v_readfirstlane_b32 s6, v1 6033; GFX6-NEXT: v_readfirstlane_b32 s7, v3 6034; GFX6-NEXT: ; return to shader part epilog 6035; 6036; GFX8-LABEL: s_ssubsat_v2i128: 6037; GFX8: ; %bb.0: 6038; GFX8-NEXT: s_sub_u32 s16, s0, s8 6039; GFX8-NEXT: s_subb_u32 s17, s1, s9 6040; GFX8-NEXT: v_mov_b32_e32 v0, s0 6041; GFX8-NEXT: s_subb_u32 s18, s2, s10 6042; GFX8-NEXT: v_mov_b32_e32 v1, s1 6043; GFX8-NEXT: s_subb_u32 s19, s3, s11 6044; GFX8-NEXT: v_mov_b32_e32 v2, s2 6045; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] 6046; GFX8-NEXT: v_mov_b32_e32 v3, s3 6047; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] 6048; GFX8-NEXT: s_cselect_b32 s0, 1, 0 6049; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6050; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] 6051; GFX8-NEXT: s_and_b32 s0, 1, s0 6052; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6053; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 6054; GFX8-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 6055; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6056; GFX8-NEXT: s_cmp_eq_u64 s[10:11], 0 6057; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 6058; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 6059; GFX8-NEXT: s_cselect_b32 s2, 1, 0 6060; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 6061; GFX8-NEXT: s_and_b32 s0, 1, s2 6062; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 6063; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 6064; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 6065; GFX8-NEXT: s_ashr_i32 s0, s19, 31 6066; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 6067; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 6068; GFX8-NEXT: v_mov_b32_e32 v1, s0 6069; GFX8-NEXT: v_mov_b32_e32 v2, s16 6070; GFX8-NEXT: v_mov_b32_e32 v3, s17 6071; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6072; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc 6073; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 6074; GFX8-NEXT: v_mov_b32_e32 v0, s1 6075; GFX8-NEXT: v_mov_b32_e32 v2, s18 6076; GFX8-NEXT: v_mov_b32_e32 v3, s19 6077; GFX8-NEXT: s_sub_u32 s0, s4, s12 6078; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc 6079; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc 6080; GFX8-NEXT: s_subb_u32 s1, s5, s13 6081; GFX8-NEXT: v_mov_b32_e32 v0, s4 6082; GFX8-NEXT: s_subb_u32 s2, s6, s14 6083; GFX8-NEXT: v_mov_b32_e32 v1, s5 6084; GFX8-NEXT: s_subb_u32 s3, s7, s15 6085; GFX8-NEXT: v_mov_b32_e32 v2, s6 6086; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 6087; GFX8-NEXT: v_mov_b32_e32 v3, s7 6088; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6089; GFX8-NEXT: s_cselect_b32 s4, 1, 0 6090; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6091; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 6092; GFX8-NEXT: s_and_b32 s4, 1, s4 6093; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6094; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 6095; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 6096; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6097; GFX8-NEXT: s_cmp_eq_u64 s[14:15], 0 6098; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 6099; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 6100; GFX8-NEXT: s_cselect_b32 s6, 1, 0 6101; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] 6102; GFX8-NEXT: s_and_b32 s4, 1, s6 6103; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 6104; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 6105; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 6106; GFX8-NEXT: s_ashr_i32 s4, s3, 31 6107; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 6108; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 6109; GFX8-NEXT: v_mov_b32_e32 v1, s4 6110; GFX8-NEXT: v_mov_b32_e32 v2, s0 6111; GFX8-NEXT: v_mov_b32_e32 v3, s1 6112; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6113; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 6114; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 6115; GFX8-NEXT: v_mov_b32_e32 v3, s5 6116; GFX8-NEXT: v_mov_b32_e32 v8, s2 6117; GFX8-NEXT: v_mov_b32_e32 v9, s3 6118; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 6119; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 6120; GFX8-NEXT: v_readfirstlane_b32 s0, v4 6121; GFX8-NEXT: v_readfirstlane_b32 s1, v5 6122; GFX8-NEXT: v_readfirstlane_b32 s2, v6 6123; GFX8-NEXT: v_readfirstlane_b32 s3, v7 6124; GFX8-NEXT: v_readfirstlane_b32 s4, v0 6125; GFX8-NEXT: v_readfirstlane_b32 s5, v2 6126; GFX8-NEXT: v_readfirstlane_b32 s6, v1 6127; GFX8-NEXT: v_readfirstlane_b32 s7, v3 6128; GFX8-NEXT: ; return to shader part epilog 6129; 6130; GFX9-LABEL: s_ssubsat_v2i128: 6131; GFX9: ; %bb.0: 6132; GFX9-NEXT: s_sub_u32 s16, s0, s8 6133; GFX9-NEXT: s_subb_u32 s17, s1, s9 6134; GFX9-NEXT: v_mov_b32_e32 v0, s0 6135; GFX9-NEXT: s_subb_u32 s18, s2, s10 6136; GFX9-NEXT: v_mov_b32_e32 v1, s1 6137; GFX9-NEXT: s_subb_u32 s19, s3, s11 6138; GFX9-NEXT: v_mov_b32_e32 v2, s2 6139; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] 6140; GFX9-NEXT: v_mov_b32_e32 v3, s3 6141; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] 6142; GFX9-NEXT: s_cselect_b32 s0, 1, 0 6143; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6144; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[2:3] 6145; GFX9-NEXT: s_and_b32 s0, 1, s0 6146; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6147; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 6148; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[8:9], 0 6149; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6150; GFX9-NEXT: s_cmp_eq_u64 s[10:11], 0 6151; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 6152; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0 6153; GFX9-NEXT: s_cselect_b32 s2, 1, 0 6154; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 6155; GFX9-NEXT: s_and_b32 s0, 1, s2 6156; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 6157; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 6158; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 6159; GFX9-NEXT: s_ashr_i32 s0, s19, 31 6160; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 6161; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 6162; GFX9-NEXT: v_mov_b32_e32 v1, s0 6163; GFX9-NEXT: v_mov_b32_e32 v2, s16 6164; GFX9-NEXT: v_mov_b32_e32 v3, s17 6165; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6166; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v1, vcc 6167; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc 6168; GFX9-NEXT: v_mov_b32_e32 v0, s1 6169; GFX9-NEXT: v_mov_b32_e32 v2, s18 6170; GFX9-NEXT: v_mov_b32_e32 v3, s19 6171; GFX9-NEXT: s_sub_u32 s0, s4, s12 6172; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v1, vcc 6173; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v0, vcc 6174; GFX9-NEXT: s_subb_u32 s1, s5, s13 6175; GFX9-NEXT: v_mov_b32_e32 v0, s4 6176; GFX9-NEXT: s_subb_u32 s2, s6, s14 6177; GFX9-NEXT: v_mov_b32_e32 v1, s5 6178; GFX9-NEXT: s_subb_u32 s3, s7, s15 6179; GFX9-NEXT: v_mov_b32_e32 v2, s6 6180; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 6181; GFX9-NEXT: v_mov_b32_e32 v3, s7 6182; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6183; GFX9-NEXT: s_cselect_b32 s4, 1, 0 6184; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 6185; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] 6186; GFX9-NEXT: s_and_b32 s4, 1, s4 6187; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 6188; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 6189; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], s[12:13], 0 6190; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc 6191; GFX9-NEXT: s_cmp_eq_u64 s[14:15], 0 6192; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 6193; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0 6194; GFX9-NEXT: s_cselect_b32 s6, 1, 0 6195; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] 6196; GFX9-NEXT: s_and_b32 s4, 1, s6 6197; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 6198; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 6199; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 6200; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6201; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 6202; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 6203; GFX9-NEXT: v_mov_b32_e32 v1, s4 6204; GFX9-NEXT: v_mov_b32_e32 v2, s0 6205; GFX9-NEXT: v_mov_b32_e32 v3, s1 6206; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 6207; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 6208; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 6209; GFX9-NEXT: v_mov_b32_e32 v3, s5 6210; GFX9-NEXT: v_mov_b32_e32 v8, s2 6211; GFX9-NEXT: v_mov_b32_e32 v9, s3 6212; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc 6213; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc 6214; GFX9-NEXT: v_readfirstlane_b32 s0, v4 6215; GFX9-NEXT: v_readfirstlane_b32 s1, v5 6216; GFX9-NEXT: v_readfirstlane_b32 s2, v6 6217; GFX9-NEXT: v_readfirstlane_b32 s3, v7 6218; GFX9-NEXT: v_readfirstlane_b32 s4, v0 6219; GFX9-NEXT: v_readfirstlane_b32 s5, v2 6220; GFX9-NEXT: v_readfirstlane_b32 s6, v1 6221; GFX9-NEXT: v_readfirstlane_b32 s7, v3 6222; GFX9-NEXT: ; return to shader part epilog 6223; 6224; GFX10-LABEL: s_ssubsat_v2i128: 6225; GFX10: ; %bb.0: 6226; GFX10-NEXT: s_sub_u32 s18, s0, s8 6227; GFX10-NEXT: s_subb_u32 s19, s1, s9 6228; GFX10-NEXT: s_subb_u32 s16, s2, s10 6229; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] 6230; GFX10-NEXT: s_subb_u32 s17, s3, s11 6231; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 6232; GFX10-NEXT: s_cselect_b32 s20, 1, 0 6233; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 6234; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] 6235; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 6236; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 6237; GFX10-NEXT: s_and_b32 s0, 1, s20 6238; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 6239; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 6240; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 6241; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 6242; GFX10-NEXT: s_cselect_b32 s1, 1, 0 6243; GFX10-NEXT: s_ashr_i32 s8, s17, 31 6244; GFX10-NEXT: s_and_b32 s1, 1, s1 6245; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000 6246; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 6247; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 6248; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 6249; GFX10-NEXT: s_sub_u32 s0, s4, s12 6250; GFX10-NEXT: s_subb_u32 s1, s5, s13 6251; GFX10-NEXT: s_subb_u32 s2, s6, s14 6252; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 6253; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 6254; GFX10-NEXT: s_subb_u32 s3, s7, s15 6255; GFX10-NEXT: v_mov_b32_e32 v5, s0 6256; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6257; GFX10-NEXT: v_mov_b32_e32 v6, s1 6258; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 6259; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 6260; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] 6261; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 6262; GFX10-NEXT: s_cselect_b32 s10, 1, 0 6263; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 6264; GFX10-NEXT: v_mov_b32_e32 v7, s3 6265; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 6266; GFX10-NEXT: s_and_b32 s4, 1, s10 6267; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 6268; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 6269; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 6270; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 6271; GFX10-NEXT: s_cselect_b32 s5, 1, 0 6272; GFX10-NEXT: s_ashr_i32 s4, s3, 31 6273; GFX10-NEXT: s_and_b32 s5, 1, s5 6274; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 6275; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 6276; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 6277; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 6278; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc_lo 6279; GFX10-NEXT: v_mov_b32_e32 v3, s18 6280; GFX10-NEXT: v_mov_b32_e32 v4, s19 6281; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 6282; GFX10-NEXT: v_mov_b32_e32 v0, s16 6283; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 6284; GFX10-NEXT: v_mov_b32_e32 v2, s17 6285; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo 6286; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo 6287; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 6288; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo 6289; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo 6290; GFX10-NEXT: v_readfirstlane_b32 s1, v4 6291; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 6292; GFX10-NEXT: v_mov_b32_e32 v1, s2 6293; GFX10-NEXT: v_readfirstlane_b32 s2, v0 6294; GFX10-NEXT: v_readfirstlane_b32 s3, v2 6295; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo 6296; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo 6297; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo 6298; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo 6299; GFX10-NEXT: v_readfirstlane_b32 s0, v3 6300; GFX10-NEXT: v_readfirstlane_b32 s4, v5 6301; GFX10-NEXT: v_readfirstlane_b32 s5, v6 6302; GFX10-NEXT: v_readfirstlane_b32 s6, v1 6303; GFX10-NEXT: v_readfirstlane_b32 s7, v7 6304; GFX10-NEXT: ; return to shader part epilog 6305; 6306; GFX11-LABEL: s_ssubsat_v2i128: 6307; GFX11: ; %bb.0: 6308; GFX11-NEXT: s_sub_u32 s18, s0, s8 6309; GFX11-NEXT: s_subb_u32 s19, s1, s9 6310; GFX11-NEXT: s_subb_u32 s16, s2, s10 6311; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[18:19], s[0:1] 6312; GFX11-NEXT: s_subb_u32 s17, s3, s11 6313; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] 6314; GFX11-NEXT: s_cselect_b32 s20, 1, 0 6315; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 6316; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] 6317; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 6318; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 6319; GFX11-NEXT: s_and_b32 s0, 1, s20 6320; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 6321; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 6322; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 6323; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 6324; GFX11-NEXT: s_cselect_b32 s1, 1, 0 6325; GFX11-NEXT: s_ashr_i32 s8, s17, 31 6326; GFX11-NEXT: s_and_b32 s1, 1, s1 6327; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 6328; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 6329; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo 6330; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 6331; GFX11-NEXT: s_sub_u32 s0, s4, s12 6332; GFX11-NEXT: s_subb_u32 s1, s5, s13 6333; GFX11-NEXT: s_subb_u32 s2, s6, s14 6334; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5] 6335; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo 6336; GFX11-NEXT: s_subb_u32 s3, s7, s15 6337; GFX11-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3 6338; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] 6339; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 6340; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 6341; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7] 6342; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 6343; GFX11-NEXT: s_cselect_b32 s10, 1, 0 6344; GFX11-NEXT: v_mov_b32_e32 v5, s0 6345; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 6346; GFX11-NEXT: s_and_b32 s4, 1, s10 6347; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 6348; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 6349; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 6350; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 6351; GFX11-NEXT: s_cselect_b32 s5, 1, 0 6352; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 6353; GFX11-NEXT: s_and_b32 s5, 1, s5 6354; GFX11-NEXT: s_ashr_i32 s4, s3, 31 6355; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo 6356; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 6357; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 6358; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 6359; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18 6360; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 6361; GFX11-NEXT: v_mov_b32_e32 v0, s16 6362; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 6363; GFX11-NEXT: v_mov_b32_e32 v4, s19 6364; GFX11-NEXT: v_mov_b32_e32 v2, s17 6365; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s8, vcc_lo 6366; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, vcc_lo 6367; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 6368; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo 6369; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo 6370; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 6371; GFX11-NEXT: v_mov_b32_e32 v1, s2 6372; GFX11-NEXT: v_readfirstlane_b32 s1, v4 6373; GFX11-NEXT: v_readfirstlane_b32 s2, v0 6374; GFX11-NEXT: v_readfirstlane_b32 s3, v2 6375; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, vcc_lo 6376; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, vcc_lo 6377; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo 6378; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, vcc_lo 6379; GFX11-NEXT: v_readfirstlane_b32 s0, v3 6380; GFX11-NEXT: v_readfirstlane_b32 s4, v5 6381; GFX11-NEXT: v_readfirstlane_b32 s5, v6 6382; GFX11-NEXT: v_readfirstlane_b32 s6, v1 6383; GFX11-NEXT: v_readfirstlane_b32 s7, v7 6384; GFX11-NEXT: ; return to shader part epilog 6385 %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) 6386 ret <2 x i128> %result 6387} 6388 6389declare i7 @llvm.ssub.sat.i7(i7, i7) #0 6390declare i8 @llvm.ssub.sat.i8(i8, i8) #0 6391declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) #0 6392declare <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8>, <4 x i8>) #0 6393 6394declare i16 @llvm.ssub.sat.i16(i16, i16) #0 6395declare <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16>, <2 x i16>) #0 6396declare <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16>, <3 x i16>) #0 6397declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) #0 6398declare <5 x i16> @llvm.ssub.sat.v5i16(<5 x i16>, <5 x i16>) #0 6399declare <6 x i16> @llvm.ssub.sat.v6i16(<6 x i16>, <6 x i16>) #0 6400declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) #0 6401 6402declare i24 @llvm.ssub.sat.i24(i24, i24) #0 6403 6404declare i32 @llvm.ssub.sat.i32(i32, i32) #0 6405declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) #0 6406declare <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32>, <3 x i32>) #0 6407declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) #0 6408declare <5 x i32> @llvm.ssub.sat.v5i32(<5 x i32>, <5 x i32>) #0 6409declare <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32>, <16 x i32>) #0 6410 6411declare i48 @llvm.ssub.sat.i48(i48, i48) #0 6412 6413declare i64 @llvm.ssub.sat.i64(i64, i64) #0 6414declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) #0 6415 6416declare i128 @llvm.ssub.sat.i128(i128, i128) #0 6417declare <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128>, <2 x i128>) #0 6418 6419attributes #0 = { nounwind readnone speculatable willreturn } 6420