/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "profile-count.h"
#include "predict.h"

/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx	= operands[1];
  rtx align_rtx = operands[3];
  bool constp	= CONST_INT_P (bytes_rtx);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just let the compiler
     generate normal code (a memset call).  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* clear 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* clear 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* clear 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
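
/* For illustration only: a hypothetical trace of the loop above for a
   35-byte clear with 128-bit alignment on an Altivec target with
   TARGET_EFFICIENT_UNALIGNED_VSX.  unaligned_vsx_ok is true (35 >= 32),
   so the chunks chosen are:

     offset  0: 16 bytes, V4SImode store of zero
     offset 16: 16 bytes, V4SImode store of zero
     offset 32:  2 bytes, HImode store of zero
     offset 34:  1 byte,  QImode store of zero

   i.e. four stores total, well under the 8 * clear_step budget.  */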

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_V16QImode:
      switch (mode)
	{
	case E_V16QImode:
	  if (!BYTES_BIG_ENDIAN)
	    {
	      if (TARGET_P9_VECTOR)
		emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
	      else
		{
		  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
						      V16QImode, 0);
		  gcc_assert (MEM_P (mem));
		  rtx addr = XEXP (mem, 0);
		  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
		  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
		  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
		  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
		}
	    }
	  else
	    emit_insn (gen_vsx_movv2di_64bit (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_QImode:
      gcc_assert (mode == E_QImode);
      emit_move_insn (reg, mem);
      break;

    default:
      gcc_unreachable ();
      break;
    }
}
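
/* For illustration only: the byte-reversing loads above make an
   unsigned word compare order the same way memcmp does.  A
   hypothetical little-endian example with SImode data:

     memory: 01 02 03 04          (first byte at lowest address)
     plain load -> 0x04030201     (first byte least significant)
     bswap load -> 0x01020304     (first byte most significant)

   so comparing the swapped values as unsigned integers compares the
   bytes in memory order.  */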

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment, in bytes, of the memory blocks
   being compared.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
	   && offset >= UNITS_PER_WORD-bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback is to do one byte.  */
  return QImode;
}
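
/* For illustration only: a hypothetical trace for a 13-byte compare
   with 8-byte alignment on a 64-bit target where word_mode loads are
   usable and TARGET_EFFICIENT_OVERLAPPING_UNALIGNED holds:

     offset 0, bytes 13: bytes >= UNITS_PER_WORD      -> DImode (8 bytes)
     offset 8, bytes  5: 5 < 8 and offset >= 8 - 5    -> DImode again;
       the caller can back the read up to offset 5 so it ends at byte
       13, overlapping three already-compared bytes instead of
       shifting unwanted bytes off.  */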

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
			   unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
  return MIN (base_align, offset & -offset);
}
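
/* For illustration only: OFFSET & -OFFSET isolates the lowest set bit
   of OFFSET, which is the largest power of two dividing it.  E.g. with
   base_align 16 and offset 12 (0b1100), 12 & -12 = 4, so pointer+12 is
   only guaranteed 4-byte aligned.  */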

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to load from.
   ORIG_ADDR is the original memory rtx; its attributes are copied
   to the new MEM.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
			       rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.
   BR_PROB is the estimated branch probability for the branch.

   If CR is null_rtx, then a new register of CMPMODE is generated and
   used for the comparison.  If A and B are both null_rtx, then CR must
   not be null, and the compare is not generated so you can use this
   with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, br_prob);
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
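
/* For illustration only: a hypothetical call such as

     do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		final_label, profile_probability::unlikely ());

   emits a compare of cmp_rem against zero into a fresh CCmode pseudo
   followed by a conditional branch to final_label, roughly a
   "cmpdi; beq final_label" pair after register allocation.  */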

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison to test against the condition register.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the input to be rotated.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
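
/* For illustration only: with DImode loads and cmp_rem = 3, the code
   above computes shift_amount = (8 - 3) << 3 = 40 bits.  Because the
   loads put the first memory byte in the most significant position,
   shifting d1 and d2 right by 40 discards the five low-order byte
   positions that lie past the compared region, leaving just the three
   valid bytes for the compare.  */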
556 
557 /* Generate rtl for an overlapping load and compare of less than a
558    full load_mode.  This assumes that the previous word is part of the
559    block being compared so it's ok to back up part of a word so we can
560    compare the last unaligned full word that ends at the end of the block.
561 
562    LOAD_MODE is the machine mode for the loads.
563    ISCONST tells whether the remaining length is a constant or in a register.
564    BYTES_REM is the remaining length if ISCONST is true.
565    DIFF is the reg for the difference.
566    CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
567    DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
568    SRC1_ADDR is the first source address.
569    SRC2_ADDR is the second source address.
570    ORIG_SRC1 is the original first source block's address rtx.
571    ORIG_SRC2 is the original second source block's address rtx.  */
572 static void
573 do_overlap_load_compare (machine_mode load_mode, bool isConst,
574 			HOST_WIDE_INT bytes_rem, rtx diff,
575 			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
576 			rtx orig_src1, rtx orig_src2)
577 {
578   HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
579   HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
580   rtx d1 = gen_reg_rtx (word_mode);
581   rtx d2 = gen_reg_rtx (word_mode);
582 
583   rtx addr1, addr2;
584   if (!isConst || addr_adj)
585     {
586       rtx adj_reg = gen_reg_rtx (word_mode);
587       if (isConst)
588 	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
589       else
590 	{
591 	  rtx reg_lms = gen_reg_rtx (word_mode);
592 	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
593 	  do_sub3 (adj_reg, cmp_rem, reg_lms);
594 	}
595 
596       addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
597       addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
598     }
599   else
600     {
601       addr1 = src1_addr;
602       addr2 = src2_addr;
603     }
604 
605   do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
606   do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
607 
608   if (TARGET_P9_MISC)
609     {
610       /* Generate a compare, and convert with a setb later.  */
611       rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
612       emit_insn (gen_rtx_SET (dcond, cmp));
613     }
614   else
615     {
616       if (word_mode == DImode)
617 	emit_insn (gen_subfdi3_carry (diff, d2, d1));
618       else
619 	emit_insn (gen_subfsi3_carry (diff, d2, d1));
620     }
621 }
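
/* For illustration only: with DImode loads and a constant bytes_rem
   of 3, addr_adj is 8 - 3 = 5, so each load starts 5 bytes before the
   current address and ends exactly at the end of the block.  The five
   leading bytes of the loaded words were already compared equal by
   the preceding code, so any difference found by the full-word
   compare must come from the 3 remaining bytes.  */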
622 
623 /* Generate the sequence of compares for strcmp/strncmp using vec/vsx
624    instructions.
625 
626    BYTES_TO_COMPARE is the number of bytes to be compared.
627    ORIG_SRC1 is the unmodified rtx for the first string.
628    ORIG_SRC2 is the unmodified rtx for the second string.
629    S1ADDR is the register to use for the base address of the first string.
630    S2ADDR is the register to use for the base address of the second string.
631    OFF_REG is the register to use for the string offset for loads.
632    S1DATA is the register for loading the first string.
633    S2DATA is the register for loading the second string.
634    VEC_RESULT is the rtx for the vector result indicating the byte difference.
635    EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
636    to strcmp/strncmp if we have equality at the end of the inline comparison.
637    P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
638    to clean up and generate the final comparison result.
639    FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
640    set the final result.
641    CHECKZERO indicates whether the sequence should check for zero bytes
642    for use doing strncmp, or not (for use doing memcmp).  */
643 static void
644 expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
645 			 rtx orig_src1, rtx orig_src2,
646 			 rtx s1addr, rtx s2addr, rtx off_reg,
647 			 rtx s1data, rtx s2data, rtx vec_result,
648 			 bool equality_compare_rest, rtx *p_cleanup_label,
649 			 rtx final_move_label, bool checkzero)
650 {
651   machine_mode load_mode;
652   unsigned int load_mode_size;
653   unsigned HOST_WIDE_INT cmp_bytes = 0;
654   unsigned HOST_WIDE_INT offset = 0;
655   rtx zero_reg = NULL;
656 
657   gcc_assert (p_cleanup_label != NULL);
658   rtx cleanup_label = *p_cleanup_label;
659 
660   emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
661   emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
662 
663   if (checkzero && !TARGET_P9_VECTOR)
664     {
665       zero_reg = gen_reg_rtx (V16QImode);
666       emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
667     }
668 
669   while (bytes_to_compare > 0)
670     {
671       /* VEC/VSX compare sequence for P8:
672 	 check each 16B with:
673 	 lxvd2x 32,28,8
674 	 lxvd2x 33,29,8
675 	 vcmpequb 2,0,1  # compare strings
676 	 vcmpequb 4,0,3  # compare w/ 0
677 	 xxlorc 37,36,34       # first FF byte is either mismatch or end of string
678 	 vcmpequb. 7,5,3  # reg 7 contains 0
679 	 bnl 6,.Lmismatch
680 
681 	 For the P8 LE case, we use lxvd2x and compare full 16 bytes
682 	 but then use use vgbbd and a shift to get two bytes with the
683 	 information we need in the correct order.
684 
685 	 VEC/VSX compare sequence if TARGET_P9_VECTOR:
686 	 lxvb16x/lxvb16x     # load 16B of each string
687 	 vcmpnezb.           # produces difference location or zero byte location
688 	 bne 6,.Lmismatch
689 
690 	 Use the overlapping compare trick for the last block if it is
691 	 less than 16 bytes.
692       */
693 
694       load_mode = V16QImode;
695       load_mode_size = GET_MODE_SIZE (load_mode);
696 
697       if (bytes_to_compare >= load_mode_size)
698 	cmp_bytes = load_mode_size;
699       else
700 	{
701 	  /* Move this load back so it doesn't go past the end.  P8/P9
702 	     can do this efficiently.  This is never called with less
703 	     than 16 bytes so we should always be able to do this.  */
704 	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
705 	  cmp_bytes = bytes_to_compare;
706 	  gcc_assert (offset > extra_bytes);
707 	  offset -= extra_bytes;
708 	  cmp_bytes = load_mode_size;
709 	  bytes_to_compare = cmp_bytes;
710 	}
711 
712       /* The offset currently used is always kept in off_reg so that the
713 	 cleanup code on P8 can use it to extract the differing byte.  */
714       emit_move_insn (off_reg, GEN_INT (offset));
715 
716       rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
717       do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
718       rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
719       do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);
720 
721       /* Cases to handle.  A and B are chunks of the two strings.
722 	 1: Not end of comparison:
723 	 A != B: branch to cleanup code to compute result.
724 	 A == B: next block
725 	 2: End of the inline comparison:
726 	 A != B: branch to cleanup code to compute result.
727 	 A == B: call strcmp/strncmp
728 	 3: compared requested N bytes:
729 	 A == B: branch to result 0.
730 	 A != B: cleanup code to compute result.  */
731 
732       unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
733 
734       if (checkzero)
735 	{
736 	  if (TARGET_P9_VECTOR)
737 	    emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
738 	  else
739 	    {
740 	      /* Emit instructions to do comparison and zero check.  */
741 	      rtx cmp_res = gen_reg_rtx (load_mode);
742 	      rtx cmp_zero = gen_reg_rtx (load_mode);
743 	      rtx cmp_combined = gen_reg_rtx (load_mode);
744 	      emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
745 	      emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
746 	      emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
747 	      emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
748 	    }
749 	}
750       else
751 	emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));
752 
753       bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
754       rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
755       rtx dst_label;
756       rtx cmp_rtx;
757       if (branch_to_cleanup)
758 	{
759 	  /* Branch to cleanup code, otherwise fall through to do more
760 	     compares.  P8 and P9 use different CR bits because on P8
761 	     we are looking at the result of a comparsion vs a
762 	     register of zeroes so the all-true condition means no
763 	     difference or zero was found.  On P9, vcmpnezb sets a byte
764 	     to 0xff if there is a mismatch or zero, so the all-false
765 	     condition indicates we found no difference or zero.  */
766 	  if (!cleanup_label)
767 	    cleanup_label = gen_label_rtx ();
768 	  dst_label = cleanup_label;
769 	  if (TARGET_P9_VECTOR && checkzero)
770 	    cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
771 	  else
772 	    cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
773 	}
774       else
775 	{
776 	  /* Branch to final return or fall through to cleanup,
777 	     result is already set to 0.  */
778 	  dst_label = final_move_label;
779 	  if (TARGET_P9_VECTOR && checkzero)
780 	    cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
781 	  else
782 	    cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
783 	}
784 
785       rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
786       rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
787 					 lab_ref, pc_rtx);
788       rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
789       add_reg_br_prob_note (j2, profile_probability::likely ());
790       JUMP_LABEL (j2) = dst_label;
791       LABEL_NUSES (dst_label) += 1;
792 
793       offset += cmp_bytes;
794       bytes_to_compare -= cmp_bytes;
795     }
796   *p_cleanup_label = cleanup_label;
797   return;
798 }
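
/* For illustration only: a hypothetical trace of the loop above for
   bytes_to_compare = 20.  The first iteration compares bytes 0-15 at
   offset 0.  The second iteration has 4 bytes left, so extra_bytes is
   16 - 4 = 12 and the offset is moved back from 16 to 4: bytes 4-19
   are compared, re-checking 12 bytes already known to be equal rather
   than masking off part of a vector.  */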

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   P8:
        vgbbd 0,0
        vsldoi 0,0,0,9
        mfvsrd 9,32
        addi 10,9,-1    # count trailing zero bits
        andc 9,10,9
        popcntd 9,9
        lbzx 10,28,9    # use that offset to load differing byte
        lbzx 3,29,9
        subf 3,3,10     # subtract for final result

   P9:
	 vclzlsbb            # counts trailing bytes with lsb=0
	 vextublx            # extract differing byte

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   OFF_REG is the register to use for the string offset for loads.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.  */

static void
emit_final_compare_vec (rtx str1, rtx str2, rtx result,
			rtx s1addr, rtx s2addr,
			rtx orig_src1, rtx orig_src2,
			rtx off_reg, rtx vec_result)
{
  if (TARGET_P9_VECTOR)
    {
      rtx diffix = gen_reg_rtx (SImode);
      rtx chr1 = gen_reg_rtx (SImode);
      rtx chr2 = gen_reg_rtx (SImode);
      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
      emit_insn (gen_vextublx (chr1, diffix, str1));
      emit_insn (gen_vextublx (chr2, diffix, str2));
      do_sub3 (result, chr1_di, chr2_di);
    }
  else
    {
      gcc_assert (TARGET_P8_VECTOR);
      rtx diffix = gen_reg_rtx (DImode);
      rtx result_gbbd = gen_reg_rtx (V16QImode);
      /* Since each byte of the input is either 00 or FF, the bytes in
	 dw0 and dw1 after vgbbd are all identical to each other.  */
      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
	 For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
      rtx result_shifted = gen_reg_rtx (V16QImode);
      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
					   result_gbbd, GEN_INT (shift_amt)));

      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
      rtx count = gen_reg_rtx (DImode);

      if (BYTES_BIG_ENDIAN)
	emit_insn (gen_clzdi2 (count, diffix));
      else
	emit_insn (gen_ctzdi2 (count, diffix));

      /* P8 doesn't have a good solution for extracting one byte from
	 a vsx reg like vextublx on P9 so we just compute the offset
	 of the differing byte and load it from each string.  */
      do_add3 (off_reg, off_reg, count);

      rtx chr1 = gen_reg_rtx (QImode);
      rtx chr2 = gen_reg_rtx (QImode);
      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
      machine_mode rmode = GET_MODE (result);
      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
      do_sub3 (result, chr1_rm, chr2_rm);
    }
}
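
/* For illustration only: the addi/andc/popcntd idiom in the P8
   sequence above counts trailing zero bits.  For a hypothetical mask
   x = 0b...0001000: x-1 = 0b...0000111 flips exactly the trailing
   zeros (and the lowest set bit), andc of the two isolates those
   flipped bits, and popcntd counts them, giving 3, the bit index of
   the first difference.  */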

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to compare?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv1.  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv2.  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv1.  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv2.  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* Max iterations and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx_insn *j;

  /* Example of generated code for a 35-byte compare with 1-byte
     alignment:

	     mtctr 8
	     li 6,0
	     li 5,8
     .L13:
	     ldbrx 7,3,6
	     ldbrx 9,10,6
	     ldbrx 0,3,5
	     ldbrx 4,10,5
	     addi 6,6,16
	     addi 5,5,16
	     subfc. 9,9,7
	     bne 0,.L10
	     subfc. 9,4,0
	     bdnzt 2,.L13
	     bne 0,.L10
	     add 3,3,6
	     add 10,10,6
	     addi 9,3,-5
	     ldbrx 7,0,9
	     addi 9,10,-5
	     ldbrx 9,0,9
	     subfc 9,9,7
	     .p2align 4,,15
     .L10:
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call
  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label, profile_probability::even ());

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label, profile_probability::even ());

      /* Loop compare bytes and iterations if bytes > max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
      else
	emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isels.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after, profile_probability::even ());
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if the remaining byte
	 count is < 16 in the runtime case either.  Compute number of
	 loop iterations.  We compare 2*word_mode per iteration so 16B
	 for 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* Inner loop comparing 2*word_mode per iteration.  */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label, profile_probability::unlikely ());

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      add_reg_br_prob_note (j, profile_probability::likely ());
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
	       diff_label, profile_probability::unlikely ());
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
	       diff_label, profile_probability::unlikely ());

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder, and if we are here then diff is 0, so just
	 return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label, profile_probability::unlikely ());
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */

	  if (!bytes_is_const)
	    {
	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
			 NULL_RTX, final_cleanup, profile_probability::even ());
	    }

	  /* Load and compare 8B.  */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label, profile_probability::even ());

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		       final_label, profile_probability::unlikely ());
	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	  Three strategies:
	  * decrement address and do overlapping compare
	  * read word_mode and mask
	  * carefully avoid crossing 4k boundary
       */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like unaligned overlapping loads.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have a runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */
1456 
1457 	  emit_label (final_cleanup);
1458 
1459 	  /* We can still avoid the slow case if the length was larger
1460 	     than one loop iteration, in which case go do the overlap
1461 	     load compare path.  */
1462 	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
1463 		     NULL_RTX, nonconst_overlap, profile_probability::even ());
1464 
1465 	  rtx rem4k = gen_reg_rtx (word_mode);
1466 	  rtx dist1 = gen_reg_rtx (word_mode);
1467 	  rtx dist2 = gen_reg_rtx (word_mode);
1468 	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
1469 	  if (word_mode == SImode)
1470 	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
1471 	  else
1472 	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
1473 	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
1474 		     handle4k_label, profile_probability::very_unlikely ());
1475 	  if (word_mode == SImode)
1476 	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
1477 	  else
1478 	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
1479 	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
1480 		     handle4k_label, profile_probability::very_unlikely ());
1481 
1482 	  /* We don't have a 4k boundary to deal with, so do
1483 	     a load/shift/compare and jump to diff.  */
1484 
1485 	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1486 				src1_addr, src2_addr, orig_src1, orig_src2);
1487 
1488 	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1489 	  JUMP_LABEL (j) = diff_label;
1490 	  LABEL_NUSES (diff_label) += 1;
1491 	  emit_barrier ();
1492 
1493 	  /* Finally in the unlikely case we are inching up to a
1494 	     4k boundary we use a compact lbzx/compare loop to do
1495 	     it a byte at a time.  */
1496 
1497 	  emit_label (handle4k_label);
1498 
1499 	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1500 	  emit_move_insn (ctr, cmp_rem);
1501 	  rtx ixreg = gen_reg_rtx (Pmode);
1502 	  emit_move_insn (ixreg, const0_rtx);
1503 
1504 	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
1505 	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
1506 	  rtx d1 = gen_reg_rtx (word_mode);
1507 	  rtx d2 = gen_reg_rtx (word_mode);
1508 
1509 	  rtx fc_loop = gen_label_rtx ();
1510 	  emit_label (fc_loop);
1511 
1512 	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
1513 	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
1514 
1515 	  do_add3 (ixreg, ixreg, const1_rtx);
1516 
1517 	  rtx cond = gen_reg_rtx (CCmode);
1518 	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
1519 	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);
1520 
1521 	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
1522 	  if (TARGET_64BIT)
1523 	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
1524 					       eqrtx, cond));
1525 	  else
1526 	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
1527 					       eqrtx, cond));
1528 	  add_reg_br_prob_note (j, profile_probability::likely ());
1529 	  JUMP_LABEL (j) = fc_loop;
1530 	  LABEL_NUSES (fc_loop) += 1;
1531 
1532 	  if (TARGET_64BIT)
1533 	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1534 	  else
1535 	    emit_move_insn (target, diff);
1536 
1537 	  /* Since we are comparing bytes, the difference can be used
1538 	     as the final result and we are done here.  */
1539 	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1540 	  JUMP_LABEL (j) = final_label;
1541 	  LABEL_NUSES (final_label) += 1;
1542 	  emit_barrier ();
1543 	}
1544     }
1545 
1546   emit_label (diff_label);
1547   /* Difference handling, 64->32 conversion.  */
1548 
1549   /* We need to produce DI result from sub, then convert to target SI
1550      while maintaining <0 / ==0 / >0 properties.  This sequence works:
1551      subfc L,A,B
1552      subfe H,H,H
1553      popcntd L,L
1554      rldimi L,H,6,0
1555 
1556      This is an alternate one Segher cooked up if somebody
1557      wants to expand this for something that doesn't have popcntd:
1558      subfc L,a,b
1559      subfe H,x,x
1560      addic t,L,-1
1561      subfe v,t,L
1562      or z,v,H
1563 
1564      And finally, p9 can just do this:
1565      cmpld A,B
1566      setb r */
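  /* To see why the sign is preserved (a sketch of the reasoning, not
     additional generated code): subfe H,H,H computes H = CA - 1, so H
     is 0 when the earlier subtract produced no borrow and all ones
     when it did.  popcntd maps the difference word to 0 iff it was 0
     and to a small positive count otherwise, so OR-ing in H gives a
     negative value exactly when a borrow occurred, zero iff the words
     were equal, and a small positive value otherwise.  */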
1567 
1568   if (TARGET_P9_MISC)
1569     emit_insn (gen_setb_unsigned (target, dcond));
1570   else
1571     {
1572       if (TARGET_64BIT)
1573 	{
1574 	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
1575 	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1576 	  emit_insn (gen_popcntddi2 (diff, diff));
1577 	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
1578 	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1579 	}
1580       else
1581 	{
1582 	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
1583 	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1584 	  emit_insn (gen_popcntdsi2 (diff, diff));
1585 	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
1586 	}
1587     }
1588 
1589   if (library_call_label != NULL)
1590     {
1591       /* Branch around memcmp call.  */
1592       j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1593       JUMP_LABEL (j) = final_label;
1594       LABEL_NUSES (final_label) += 1;
1595       emit_barrier ();
1596 
1597       /* Make memcmp library call.  cmp_rem is the number of bytes
1598 	 that remain to be compared, i.e. the amount we expect memcmp
1599 	 to compare.  If we don't find a difference in the loop compare, do
1600 	 the library call directly instead of doing a small compare just
1601 	 to get to an arbitrary boundary before calling it anyway.
1602 	 Also, update addresses to point to the next word to examine.  */
1603       emit_label (library_call_label);
1604 
1605       rtx len_rtx = gen_reg_rtx (word_mode);
1606       if (bytes_is_const)
1607 	{
1608 	  emit_move_insn (len_rtx, cmp_rem);
1609 	  do_add3 (src1_addr, src1_addr, iv1);
1610 	  do_add3 (src2_addr, src2_addr, iv1);
1611 	}
1612       else
1613 	emit_move_insn (len_rtx, bytes_rtx);
1614 
1615       tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
1616       emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1617 			       target, LCT_NORMAL, GET_MODE (target),
1618 			       src1_addr, Pmode,
1619 			       src2_addr, Pmode,
1620 			       len_rtx, GET_MODE (len_rtx));
1621     }
1622 
1623   /* Emit final_label.  */
1624   emit_label (final_label);
1625   return true;
1626 }
1627 
1628 /* Generate code to convert a DImode-plus-carry subtract result into
1629    a SImode result that has the same <0 / ==0 / >0 properties to
1630    produce the final result from memcmp.
1631 
1632    TARGET is the rtx for the register to receive the memcmp result.
1633    SUB_RESULT is the rtx for the register containing the subtract result.  */
1634 
1635 void
1636 generate_6432_conversion (rtx target, rtx sub_result)
1637 {
1638   /* We need to produce DI result from sub, then convert to target SI
1639      while maintaining <0 / ==0 / >0 properties.  This sequence works:
1640      subfc L,A,B
1641      subfe H,H,H
1642      popcntd L,L
1643      rldimi L,H,6,0
1644 
1645      This is an alternate one Segher cooked up if somebody
1646      wants to expand this for something that doesn't have popcntd:
1647      subfc L,a,b
1648      subfe H,x,x
1649      addic t,L,-1
1650      subfe v,t,L
1651      or z,v,H
1652 
1653      And finally, p9 can just do this:
1654      cmpld A,B
1655      setb r */
1656 
1657   if (TARGET_64BIT)
1658     {
1659       rtx tmp_reg_ca = gen_reg_rtx (DImode);
1660       emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1661       rtx popcnt = gen_reg_rtx (DImode);
1662       emit_insn (gen_popcntddi2 (popcnt, sub_result));
1663       rtx tmp2 = gen_reg_rtx (DImode);
1664       emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
1665       emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
1666     }
1667   else
1668     {
1669       rtx tmp_reg_ca = gen_reg_rtx (SImode);
1670       emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1671       rtx popcnt = gen_reg_rtx (SImode);
1672       emit_insn (gen_popcntdsi2 (popcnt, sub_result));
1673       emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
1674     }
1675 }
1676 
1677 /* Generate memcmp expansion using in-line non-loop GPR instructions.
1678    The bool return indicates whether code for a 64->32 conversion
1679    should be generated.
1680 
1681    BYTES is the number of bytes to be compared.
1682    BASE_ALIGN is the minimum alignment for both blocks to compare.
1683    ORIG_SRC1 is the original pointer to the first block to compare.
1684    ORIG_SRC2 is the original pointer to the second block to compare.
1685    SUB_RESULT is the reg rtx for the result from the final subtract.
1686    COND is rtx for a condition register that will be used for the final
1687    compare on power9 or better.
1688    FINAL_RESULT is the reg rtx for the final memcmp result.
1689    P_CONVERT_LABEL is a pointer to rtx that will be used to store the
1690    label generated for a branch to the 64->32 code, if such a branch
1691    is needed.
1692    P_FINAL_LABEL is a pointer to rtx that will be used to store the label
1693    for the end of the memcmp if a branch there is needed.  */
1695 
1696 bool
1697 expand_block_compare_gpr (unsigned HOST_WIDE_INT bytes, unsigned int base_align,
1698 			 rtx orig_src1, rtx orig_src2,
1699 			 rtx sub_result, rtx cond, rtx final_result,
1700 			 rtx *p_convert_label, rtx *p_final_label)
1701 {
1702   /* Example of generated code for comparing 18 bytes with 1-byte alignment.
1703      Compiled with -fno-reorder-blocks for clarity.
1704              ldbrx 10,31,8
1705              ldbrx 9,7,8
1706              subfc. 9,9,10
1707              bne 0,.L6487
1708              addi 9,12,8
1709              addi 5,11,8
1710              ldbrx 10,0,9
1711              ldbrx 9,0,5
1712              subfc. 9,9,10
1713              bne 0,.L6487
1714              addi 9,12,16
1715              lhbrx 10,0,9
1716              addi 9,11,16
1717              lhbrx 9,0,9
1718              subf 9,9,10
1719              b .L6488
1720              .p2align 4,,15
1721      .L6487: #convert_label
1722              popcntd 9,9
1723              subfe 10,10,10
1724              or 9,9,10
1725      .L6488: #final_label
1726              extsw 10,9
1727 
1728      We start off with DImode for two blocks that jump to the DI->SI conversion
1729      if the difference is found there, then a final block of HImode that skips
1730      the DI->SI conversion.  */
1731 
1732   unsigned HOST_WIDE_INT offset = 0;
1733   unsigned int load_mode_size;
1734   HOST_WIDE_INT cmp_bytes = 0;
1735   rtx src1 = orig_src1;
1736   rtx src2 = orig_src2;
1737   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1738   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1739   bool need_6432_conv = false;
1740   rtx convert_label = NULL;
1741   rtx final_label = NULL;
1742   machine_mode load_mode;
1743 
1744   while (bytes > 0)
1745     {
1746       unsigned int align = compute_current_alignment (base_align, offset);
1747       load_mode = select_block_compare_mode (offset, bytes, align);
1748       load_mode_size = GET_MODE_SIZE (load_mode);
1749       if (bytes >= load_mode_size)
1750 	cmp_bytes = load_mode_size;
1751       else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1752 	{
1753 	  /* Move this load back so it doesn't go past the end.
1754 	     P8/P9 can do this efficiently.  */
1755 	  unsigned int extra_bytes = load_mode_size - bytes;
1756 	  cmp_bytes = bytes;
1757 	  if (extra_bytes < offset)
1758 	    {
1759 	      offset -= extra_bytes;
1760 	      cmp_bytes = load_mode_size;
1761 	      bytes = cmp_bytes;
1762 	    }
1763 	}
1764       else
1765 	/* P7 and earlier can't do the overlapping load trick fast,
1766 	   so this forces a non-overlapping load and a shift to get
1767 	   rid of the extra bytes.  */
1768 	cmp_bytes = bytes;
1769 
1770       src1 = adjust_address (orig_src1, load_mode, offset);
1771       src2 = adjust_address (orig_src2, load_mode, offset);
1772 
1773       if (!REG_P (XEXP (src1, 0)))
1774 	{
1775 	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
1776 	  src1 = replace_equiv_address (src1, src1_reg);
1777 	}
1778       set_mem_size (src1, load_mode_size);
1779 
1780       if (!REG_P (XEXP (src2, 0)))
1781 	{
1782 	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
1783 	  src2 = replace_equiv_address (src2, src2_reg);
1784 	}
1785       set_mem_size (src2, load_mode_size);
1786 
1787       do_load_for_compare (tmp_reg_src1, src1, load_mode);
1788       do_load_for_compare (tmp_reg_src2, src2, load_mode);
1789 
1790       if (cmp_bytes < load_mode_size)
1791 	{
1792 	  /* Shift unneeded bytes off.  */
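	  /* E.g. comparing 5 bytes via an 8-byte load: shift both
	     values right by (8 - 5) * 8 = 24 bits.  The loads put the
	     first memory byte in the most significant position (using
	     byte-reversing loads on little-endian), so the three
	     trailing bytes that are beyond the compare sit at the low
	     end and drop out of the subtract.  */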
1793 	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
1794 	  if (word_mode == DImode)
1795 	    {
1796 	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
1797 	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
1798 	    }
1799 	  else
1800 	    {
1801 	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
1802 	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
1803 	    }
1804 	}
1805 
1806       int remain = bytes - cmp_bytes;
1807       if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
1808 	{
1809 	  /* FINAL_RESULT is larger than the load size, so we don't
1810 	     need to reduce the result size.  */
1811 
1812 	  /* We previously did a block that needed 64->32 conversion but
1813 	     the current block does not, so a label is needed to jump
1814 	     to the end.  */
1815 	  if (need_6432_conv && !final_label)
1816 	    final_label = gen_label_rtx ();
1817 
1818 	  if (remain > 0)
1819 	    {
1820 	      /* This is not the last block, branch to the end if the result
1821 		 of this subtract is not zero.  */
1822 	      if (!final_label)
1823 		final_label = gen_label_rtx ();
1824 	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1825 	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
1826 	      rtx cr = gen_reg_rtx (CCmode);
1827 	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
1828 	      emit_insn (gen_movsi (final_result,
1829 				    gen_lowpart (SImode, tmp_reg_src2)));
1830 	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
1831 	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1832 						 fin_ref, pc_rtx);
1833 	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1834 	      add_reg_br_prob_note (j, profile_probability::unlikely ());
1835 	      JUMP_LABEL (j) = final_label;
1836 	      LABEL_NUSES (final_label) += 1;
1837 	    }
1838 	  else
1839 	    {
1840 	      if (word_mode == DImode)
1841 		{
1842 		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
1843 					 tmp_reg_src2));
1844 		  emit_insn (gen_movsi (final_result,
1845 					gen_lowpart (SImode, tmp_reg_src2)));
1846 		}
1847 	      else
1848 		emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2));
1849 
1850 	      if (final_label)
1851 		{
1852 		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1853 		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
1854 		  JUMP_LABEL (j) = final_label;
1855 		  LABEL_NUSES (final_label) += 1;
1856 		  emit_barrier ();
1857 		}
1858 	    }
1859 	}
1860       else
1861 	{
1862 	  /* Do we need a 64->32 conversion block? We need the 64->32
1863 	     conversion even if final_result size == load_mode size because
1864 	     the subtract generates one extra bit.  */
1865 	  need_6432_conv = true;
1866 
1867 	  if (remain > 0)
1868 	    {
1869 	      if (!convert_label)
1870 		convert_label = gen_label_rtx ();
1871 
1872 	      /* Compare to zero and branch to convert_label if not zero.  */
1873 	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
1874 	      if (TARGET_P9_MISC)
1875 		{
1876 		/* Generate a compare, and convert with a setb later.
1877 		   Use cond that is passed in because the caller needs
1878 		   to use it for the 64->32 conversion later.  */
1879 		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1880 					     tmp_reg_src2);
1881 		  emit_insn (gen_rtx_SET (cond, cmp));
1882 		}
1883 	      else
1884 		{
1885 		  /* Generate a subfc. and use the longer sequence for
1886 		     conversion.  Cond is not used outside this
1887 		     function in this case.  */
1888 		  cond = gen_reg_rtx (CCmode);
1889 		  if (TARGET_64BIT)
1890 		    emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
1891 						       tmp_reg_src1, cond));
1892 		  else
1893 		    emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
1894 						       tmp_reg_src1, cond));
1895 		}
1896 
1897 	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
1898 	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1899 						 cvt_ref, pc_rtx);
1900 	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1901 	      add_reg_br_prob_note (j, profile_probability::likely ());
1902 	      JUMP_LABEL (j) = convert_label;
1903 	      LABEL_NUSES (convert_label) += 1;
1904 	    }
1905 	  else
1906 	    {
1907 	      /* Just do the subtract/compare.  Since this is the last block,
1908 		 the convert code will be generated immediately following.  */
1909 	      if (TARGET_P9_MISC)
1910 		{
1911 		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1912 					     tmp_reg_src2);
1913 		  emit_insn (gen_rtx_SET (cond, cmp));
1914 		}
1915 	      else
1916 		if (TARGET_64BIT)
1917 		  emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
1918 						tmp_reg_src1));
1919 		else
1920 		  emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
1921 						tmp_reg_src1));
1922 	    }
1923 	}
1924 
1925       offset += cmp_bytes;
1926       bytes -= cmp_bytes;
1927     }
1928 
1929   if (convert_label)
1930     *p_convert_label = convert_label;
1931   if (final_label)
1932     *p_final_label = final_label;
1933   return need_6432_conv;
1934 }
1935 
1936 /* Expand a block compare operation, and return true if successful.
1937    Return false if we should let the compiler generate normal code,
1938    probably a memcmp call.
1939 
1940    OPERANDS[0] is the target (result).
1941    OPERANDS[1] is the first source.
1942    OPERANDS[2] is the second source.
1943    OPERANDS[3] is the length.
1944    OPERANDS[4] is the alignment.  */
1945 bool
1946 expand_block_compare (rtx operands[])
1947 {
1948   rtx target = operands[0];
1949   rtx orig_src1 = operands[1];
1950   rtx orig_src2 = operands[2];
1951   rtx bytes_rtx = operands[3];
1952   rtx align_rtx = operands[4];
1953 
1954   /* This case is complicated to handle because the subtract
1955      with carry instructions do not generate the 64-bit
1956      carry and so we must emit code to calculate it ourselves.
1957      We choose not to implement this yet.  */
1958   if (TARGET_32BIT && TARGET_POWERPC64)
1959     return false;
1960 
1961   bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
1962 
1963   /* Allow this param to shut off all expansion.  */
1964   if (rs6000_block_compare_inline_limit == 0)
1965     return false;
1966 
1967   /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1968      However, slow_unaligned_access returns true on P7 even though the
1969      performance of this code is good there.  */
1970   if (!isP7
1971       && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
1972 	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
1973     return false;
1974 
1975   /* Unaligned l*brx traps on P7 so don't do this.  However, this should
1976      not matter much because LE isn't really supported on P7 anyway.  */
1977   if (isP7 && !BYTES_BIG_ENDIAN)
1978     return false;
1979 
1980   /* If this is not a fixed size compare, try generating loop code and
1981      if that fails just call memcmp.  */
1982   if (!CONST_INT_P (bytes_rtx))
1983     return expand_compare_loop (operands);
1984 
1985   /* This must be a fixed size alignment.  */
1986   if (!CONST_INT_P (align_rtx))
1987     return false;
1988 
1989   unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
1990 
1991   gcc_assert (GET_MODE (target) == SImode);
1992 
1993   /* Anything to move?  */
1994   unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
1995   if (bytes == 0)
1996     return true;
1997 
1998   /* P7/P8 code uses cond for subfc. but P9 uses
1999      it for cmpld which needs CCUNSmode.  */
2000   rtx cond = NULL;
2001   if (TARGET_P9_MISC)
2002     cond = gen_reg_rtx (CCUNSmode);
2003 
2004   /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
2005      least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2006      at least POWER8.  That way we can rely on overlapping compares to
2007      do the final comparison of less than 16 bytes.  Also I do not
2008      want to deal with making this work for 32 bits.  In addition, we
2009      have to make sure that we have at least P8_VECTOR (we don't allow
2010      P9_VECTOR without P8_VECTOR).  */
2011   int use_vec = (bytes >= 33 && !TARGET_32BIT
2012 		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
2013 
2014   /* We don't want to generate too much code.  The loop code can take
2015      over for lengths greater than 31 bytes.  */
2016   unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
2017 
2018   /* Don't generate too much code if vsx was disabled.  */
2019   if (!use_vec && max_bytes > 1)
2020     max_bytes = ((max_bytes + 1) / 2) - 1;
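  /* For example, an inline limit of 63 is reduced here to
     ((63 + 1) / 2) - 1 == 31 bytes (63 is only an illustrative
     value, not necessarily the default).  */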
2021 
2022   if (!IN_RANGE (bytes, 1, max_bytes))
2023     return expand_compare_loop (operands);
2024 
2025   /* The code generated for p7 and older is not faster than glibc
2026      memcmp if alignment is small and length is not short, so bail
2027      out to avoid those conditions.  */
2028   if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
2029       && ((base_align == 1 && bytes > 16)
2030 	  || (base_align == 2 && bytes > 32)))
2031     return false;
2032 
2033   rtx final_label = NULL;
2034 
2035   if (use_vec)
2036     {
2037       rtx final_move_label = gen_label_rtx ();
2038       rtx s1addr = gen_reg_rtx (Pmode);
2039       rtx s2addr = gen_reg_rtx (Pmode);
2040       rtx off_reg = gen_reg_rtx (Pmode);
2041       rtx cleanup_label = NULL;
2042       rtx vec_result = gen_reg_rtx (V16QImode);
2043       rtx s1data = gen_reg_rtx (V16QImode);
2044       rtx s2data = gen_reg_rtx (V16QImode);
2045       rtx result_reg = gen_reg_rtx (word_mode);
2046       emit_move_insn (result_reg, GEN_INT (0));
2047 
2048       expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
2049 			       s1addr, s2addr, off_reg, s1data, s2data,
2050 			       vec_result, false,
2051 			       &cleanup_label, final_move_label, false);
2052 
2053       if (cleanup_label)
2054 	emit_label (cleanup_label);
2055 
2056       emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));
2057 
2058       emit_final_compare_vec (s1data, s2data, result_reg,
2059 			      s1addr, s2addr, orig_src1, orig_src2,
2060 			      off_reg, vec_result);
2061 
2062       emit_label (final_move_label);
2063       emit_insn (gen_movsi (target,
2064 			    gen_lowpart (SImode, result_reg)));
2065     }
2066   else
2067     { /* Generate GPR code.  */
2068 
2069       rtx convert_label = NULL;
2070       rtx sub_result = gen_reg_rtx (word_mode);
2071       bool need_6432_conversion =
2072 	expand_block_compare_gpr (bytes, base_align,
2073 				 orig_src1, orig_src2,
2074 				 sub_result, cond, target,
2075 				 &convert_label, &final_label);
2076 
2077       if (need_6432_conversion)
2078 	{
2079 	  if (convert_label)
2080 	    emit_label (convert_label);
2081 	  if (TARGET_P9_MISC)
2082 	    emit_insn (gen_setb_unsigned (target, cond));
2083 	  else
2084 	    generate_6432_conversion (target, sub_result);
2085 	}
2086     }
2087 
2088   if (final_label)
2089     emit_label (final_label);
2090 
2091   return true;
2092 }
2093 
2094 /* Generate page crossing check and branch code to set up for
2095    strncmp when we don't have DI alignment.
2096    STRNCMP_LABEL is the label to branch to if there is a page crossing.
2097    SRC_ADDR is the string address to be examined.
2098    BYTES is the max number of bytes to compare.  */
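/* For example, with BYTES == 16 the branch to STRNCMP_LABEL is taken
   when (SRC_ADDR & 0xfff) >= 4096 - 16 == 4080, i.e. whenever a
   16-byte access starting at SRC_ADDR might run into the next 4k
   page.  */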
2099 static void
2100 expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
2101 {
2102   rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
2103   rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
2104   do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
2105   rtx cond = gen_reg_rtx (CCmode);
2106   emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
2107 					 GEN_INT (4096 - bytes)));
2108 
2109   rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
2110 
2111   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2112 				     lab_ref, pc_rtx);
2113   rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2114   add_reg_br_prob_note (j, profile_probability::unlikely ());
2115   JUMP_LABEL (j) = strncmp_label;
2116   LABEL_NUSES (strncmp_label) += 1;
2117 }
2118 
2119 /* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
2120    BYTES_TO_COMPARE is the number of bytes to be compared.
2121    BASE_ALIGN is the smaller of the alignment of the two strings.
2122    ORIG_SRC1 is the unmodified rtx for the first string.
2123    ORIG_SRC2 is the unmodified rtx for the second string.
2124    TMP_REG_SRC1 is the register for loading the first string.
2125    TMP_REG_SRC2 is the register for loading the second string.
2126    RESULT_REG is the rtx for the result register.
2127    EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
2128    to strcmp/strncmp if we have equality at the end of the inline comparison.
2129    P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
2130    to clean up and generate the final comparison result.
2131    FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
2132    set the final result.  */
2133 static void
2134 expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
2135 			     unsigned int base_align,
2136 			     rtx orig_src1, rtx orig_src2,
2137 			     rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
2138 			     bool equality_compare_rest, rtx *p_cleanup_label,
2139 			     rtx final_move_label)
2140 {
2141   unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
2142   machine_mode load_mode;
2143   unsigned int load_mode_size;
2144   unsigned HOST_WIDE_INT cmp_bytes = 0;
2145   unsigned HOST_WIDE_INT offset = 0;
2146   rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2147   rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
2148   gcc_assert (p_cleanup_label != NULL);
2149   rtx cleanup_label = *p_cleanup_label;
2150 
2151   while (bytes_to_compare > 0)
2152     {
2153       /* GPR compare sequence:
2154          check each 8B with: ld/ld/cmpb/cmpb/orc./bne
2155 
2156          cleanup code at end:
2157          cntlzd        get bit of first zero/diff byte
2158          subfic        convert for rldcl use
2159          rldcl rldcl   extract diff/zero byte
2160          subf          subtract for final result
2161 
2162          The last compare can branch around the cleanup code if the
2163          result is zero because the strings are exactly equal.  */
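      /* For example, with 8-byte chunks "abcdefgh" and "abcdxfgh",
	 cmpb of the two loaded words gives 0xff in each matching
	 byte position and 0x00 at the differing byte, and a second
	 cmpb against zero flags any nul bytes.  OR-ing the
	 complement of the first result with the second yields a
	 nonzero word exactly when there is a difference or a
	 terminating nul, which is what the recording form tests
	 (an illustration of the existing sequence, not extra code).  */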
2164 
2165       unsigned int align = compute_current_alignment (base_align, offset);
2166       load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
2167       load_mode_size = GET_MODE_SIZE (load_mode);
2168       if (bytes_to_compare >= load_mode_size)
2169 	cmp_bytes = load_mode_size;
2170       else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
2171 	{
2172 	  /* Move this load back so it doesn't go past the end.
2173 	     P8/P9 can do this efficiently.  */
2174 	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
2175 	  cmp_bytes = bytes_to_compare;
2176 	  if (extra_bytes < offset)
2177 	    {
2178 	      offset -= extra_bytes;
2179 	      cmp_bytes = load_mode_size;
2180 	      bytes_to_compare = cmp_bytes;
2181 	    }
2182 	}
2183       else
2184 	/* P7 and earlier can't do the overlapping load trick fast,
2185 	   so this forces a non-overlapping load and a shift to get
2186 	   rid of the extra bytes.  */
2187 	cmp_bytes = bytes_to_compare;
2188 
2189       rtx offset_rtx;
2190       if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
2191 	offset_rtx = GEN_INT (offset);
2192       else
2193 	{
2194 	  offset_rtx = gen_reg_rtx (Pmode);
2195 	  emit_move_insn (offset_rtx, GEN_INT (offset));
2196 	}
2197       rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
2198       rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
2199 
2200       do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
2201       do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
2202 
2203       /* We must always left-align the data we read, and
2204 	 clear any bytes to the right that are beyond the string.
2205 	 Otherwise the cmpb sequence won't produce the correct
2206 	 results.  However if there is only one byte left, we
2207 	 can just subtract to get the final result so the shifts
2208 	 and clears are not needed.  */
2209 
2210       unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
2211 
2212       /* Loading just a single byte is a special case.  If we are
2213 	 loading more than that, we have to check whether we are
2214 	 looking at the entire chunk of data.  If not, rotate left and
2215 	 clear right so that bytes we aren't supposed to look at are
2216 	 zeroed, and the first byte we are supposed to compare is
2217 	 leftmost.  */
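      /* Illustration: with DImode word_mode, a 4-byte load of which
	 only 3 bytes are part of the compare is rotated left by
	 (8 - 4) * 8 == 32 bits to left-align the loaded data, then
	 ANDed with ~0 << 40 to clear the low 40 bits, leaving just
	 the 3 bytes of interest at the top of the register.  */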
2218       if (load_mode_size != 1)
2219 	{
2220 	  if (load_mode_size < word_mode_size)
2221 	    {
2222 	      /* Rotate left first.  */
2223 	      rtx sh = GEN_INT (BITS_PER_UNIT
2224 				* (word_mode_size - load_mode_size));
2225 	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
2226 	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
2227 	    }
2228 
2229 	  if (cmp_bytes < word_mode_size)
2230 	    {
2231 	      /* Now clear right.  This plus the rotate can be
2232 		 turned into a rldicr instruction.  */
2233 	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
2234 	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
2235 	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
2236 	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
2237 	    }
2238 	}
2239 
2240       /* Cases to handle.  A and B are chunks of the two strings.
2241 	 1: Not end of comparison:
2242 	 A != B: branch to cleanup code to compute result.
2243 	 A == B: check for 0 byte, next block if not found.
2244 	 2: End of the inline comparison:
2245 	 A != B: branch to cleanup code to compute result.
2246 	 A == B: check for 0 byte, call strcmp/strncmp
2247 	 3: Compared the requested N bytes:
2248 	 A == B: branch to result 0.
2249 	 A != B: cleanup code to compute result.  */
2250 
2251       rtx dst_label;
2252       if (remain > 0 || equality_compare_rest)
2253 	{
2254 	  /* Branch to cleanup code, otherwise fall through to do
2255 	     more compares.  */
2256 	  if (!cleanup_label)
2257 	    cleanup_label = gen_label_rtx ();
2258 	  dst_label = cleanup_label;
2259 	}
2260       else
2261 	/* Branch to end and produce result of 0.  */
2262 	dst_label = final_move_label;
2263 
2264       if (load_mode_size == 1)
2265 	{
2266 	  /* Special case for comparing just a single byte.  */
2267 	  if (equality_compare_rest)
2268 	    {
2269 	      /* Use subf./bne to branch to final_move_label if the
2270 		 byte differs, otherwise fall through to the strncmp
2271 		 call.  We must also check for a zero byte here as we
2272 		 must not make the library call if this is the end of
2273 		 the string.  */
2274 
2275 	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2276 	      rtx cond = gen_reg_rtx (CCmode);
2277 	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
2278 					    tmp_reg_src1, tmp_reg_src2);
2279 	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
2280 	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2281 
2282 	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2283 						 lab_ref, pc_rtx);
2284 	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2285 	      add_reg_br_prob_note (j, profile_probability::unlikely ());
2286 	      JUMP_LABEL (j) = final_move_label;
2287 	      LABEL_NUSES (final_move_label) += 1;
2288 
2289 	      /* Check for zero byte here before fall through to
2290 		 library call.  This catches the case where the
2291 		 strings are equal and end in a zero byte at this
2292 		 position.  */
2293 
2294 	      rtx cond0 = gen_reg_rtx (CCmode);
2295 	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
2296 						      const0_rtx));
2297 
2298 	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);
2299 
2300 	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
2301 						 lab_ref, pc_rtx);
2302 	      rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
2303 	      add_reg_br_prob_note (j0, profile_probability::unlikely ());
2304 	      JUMP_LABEL (j0) = final_move_label;
2305 	      LABEL_NUSES (final_move_label) += 1;
2306 	    }
2307 	  else
2308 	    {
2309 	      /* This is the last byte to be compared so we can use
2310 		 subf to compute the final result and branch
2311 		 unconditionally to final_move_label.  */
2312 
2313 	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);
2314 
2315 	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2316 	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2317 	      JUMP_LABEL (j) = final_move_label;
2318 	      LABEL_NUSES (final_move_label) += 1;
2319 	      emit_barrier ();
2320 	    }
2321 	}
2322       else
2323 	{
2324 	  rtx cmpb_zero = gen_reg_rtx (word_mode);
2325 	  rtx cmpb_diff = gen_reg_rtx (word_mode);
2326 	  rtx zero_reg = gen_reg_rtx (word_mode);
2327 	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
2328 	  rtx cond = gen_reg_rtx (CCmode);
2329 
2330 	  emit_move_insn (zero_reg, GEN_INT (0));
2331 	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
2332 	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
2333 	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
2334 	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);
2335 
2336 	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);
2337 
2338 	  rtx cmp_rtx;
2339 	  if (remain == 0 && !equality_compare_rest)
2340 	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
2341 	  else
2342 	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2343 
2344 	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2345 					     lab_ref, pc_rtx);
2346 	  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2347 	  add_reg_br_prob_note (j, profile_probability::unlikely ());
2348 	  JUMP_LABEL (j) = dst_label;
2349 	  LABEL_NUSES (dst_label) += 1;
2350 	}
2351 
2352       offset += cmp_bytes;
2353       bytes_to_compare -= cmp_bytes;
2354     }
2355 
2356   *p_cleanup_label = cleanup_label;
2357   return;
2358 }
2359 
2360 /* Generate the final sequence that identifies the differing
2361    byte and generates the final result, taking into account
2362    zero bytes:
2363 
2364    cntlzd            get bit of first zero/diff byte
2365    addi              convert for rldcl use
2366    rldcl rldcl       extract diff/zero byte
2367    subf              subtract for final result
2368 
2369    STR1 is the reg rtx for data from string 1.
2370    STR2 is the reg rtx for data from string 2.
2371    RESULT is the reg rtx for the comparison result.  */
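/* Worked example (DImode, an illustration rather than generated
   code): if RESULT from the cmpb sequence is 0x0000000000ff0000,
   the first zero/diff byte is the sixth byte from the top, so
   cntlzd yields 40 and the rotate amount becomes 48.  Rotating each
   string word left by 48 bits leaves that byte of the string in the
   low 8 bits, the AND with 0xff extracts it, and subtracting the
   two extracted bytes gives a result with the sign strcmp
   requires.  */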
2372 
2373 static void
2374 emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
2375 {
2376   machine_mode m = GET_MODE (str1);
2377   rtx rot_amt = gen_reg_rtx (m);
2378 
2379   rtx rot1_1 = gen_reg_rtx (m);
2380   rtx rot1_2 = gen_reg_rtx (m);
2381   rtx rot2_1 = gen_reg_rtx (m);
2382   rtx rot2_2 = gen_reg_rtx (m);
2383 
2384   if (m == SImode)
2385     {
2386       emit_insn (gen_clzsi2 (rot_amt, result));
2387       emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
2388       emit_insn (gen_rotlsi3 (rot1_1, str1,
2389 			      gen_lowpart (SImode, rot_amt)));
2390       emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2391       emit_insn (gen_rotlsi3 (rot2_1, str2,
2392 			      gen_lowpart (SImode, rot_amt)));
2393       emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2394       emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
2395     }
2396   else if (m == DImode)
2397     {
2398       emit_insn (gen_clzdi2 (rot_amt, result));
2399       emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
2400       emit_insn (gen_rotldi3 (rot1_1, str1,
2401 			      gen_lowpart (SImode, rot_amt)));
2402       emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2403       emit_insn (gen_rotldi3 (rot2_1, str2,
2404 			      gen_lowpart (SImode, rot_amt)));
2405       emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2406       emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
2407     }
2408   else
2409     gcc_unreachable ();
2410 
2411   return;
2412 }
2413 
2414 /* Expand a string compare operation with length, and return
2415    true if successful.  Return false if we should let the
2416    compiler generate normal code, probably a strncmp call.
2417 
2418    OPERANDS[0] is the target (result).
2419    OPERANDS[1] is the first source.
2420    OPERANDS[2] is the second source.
2421    If NO_LENGTH is zero, then:
2422    OPERANDS[3] is the length.
2423    OPERANDS[4] is the alignment in bytes.
2424    If NO_LENGTH is nonzero, then:
2425    OPERANDS[3] is the alignment in bytes.  */
2426 bool
2427 expand_strn_compare (rtx operands[], int no_length)
2428 {
2429   rtx target = operands[0];
2430   rtx orig_src1 = operands[1];
2431   rtx orig_src2 = operands[2];
2432   rtx bytes_rtx, align_rtx;
2433   if (no_length)
2434     {
2435       bytes_rtx = NULL;
2436       align_rtx = operands[3];
2437     }
2438   else
2439     {
2440       bytes_rtx = operands[3];
2441       align_rtx = operands[4];
2442     }
2443 
2444   rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2445   rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
2446 
2447   /* If we have a length, it must be constant.  This simplifies things
2448      a bit as we don't have to generate code to check if we've exceeded
2449      the length.  Later this could be expanded to handle this case.  */
2450   if (!no_length && !CONST_INT_P (bytes_rtx))
2451     return false;
2452 
2453   /* This must be a fixed size alignment.  */
2454   if (!CONST_INT_P (align_rtx))
2455     return false;
2456 
2457   unsigned int base_align = UINTVAL (align_rtx);
2458   unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
2459   unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
2460 
2461   /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
2462   if (targetm.slow_unaligned_access (word_mode, align1)
2463       || targetm.slow_unaligned_access (word_mode, align2))
2464     return false;
2465 
2466   gcc_assert (GET_MODE (target) == SImode);
2467 
2468   unsigned int required_align = 8;
2469 
2470   unsigned HOST_WIDE_INT offset = 0;
2471   unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
2472   unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
2473 
2474   if (no_length)
2475     bytes = rs6000_string_compare_inline_limit;
2476   else
2477     bytes = UINTVAL (bytes_rtx);
2478 
2479   /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
2480      least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2481      at least POWER8.  That way we can rely on overlapping compares to
2482      do the final comparison of less than 16 bytes.  Also I do not
2483      want to deal with making this work for 32 bits.  In addition, we
2484      have to make sure that we have at least P8_VECTOR (we don't allow
2485      P9_VECTOR without P8_VECTOR).  */
2486   int use_vec = (bytes >= 16 && !TARGET_32BIT
2487 		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
2488 
2489   if (use_vec)
2490     required_align = 16;
2491 
2492   machine_mode load_mode;
2493   rtx tmp_reg_src1, tmp_reg_src2;
2494   if (use_vec)
2495     {
2496       load_mode = V16QImode;
2497       tmp_reg_src1 = gen_reg_rtx (V16QImode);
2498       tmp_reg_src2 = gen_reg_rtx (V16QImode);
2499     }
2500   else
2501     {
2502       load_mode = select_block_compare_mode (0, bytes, base_align);
2503       tmp_reg_src1 = gen_reg_rtx (word_mode);
2504       tmp_reg_src2 = gen_reg_rtx (word_mode);
2505     }
2506 
2507   compare_length = rs6000_string_compare_inline_limit;
2508 
2509   /* If we have equality at the end of the last compare and we have not
2510      found the end of the string, we need to call strcmp/strncmp to
2511      compare the remainder.  */
2512   bool equality_compare_rest = false;
2513 
2514   if (no_length)
2515     {
2516       bytes = compare_length;
2517       equality_compare_rest = true;
2518     }
2519   else
2520     {
2521       if (bytes <= compare_length)
2522 	compare_length = bytes;
2523       else
2524 	equality_compare_rest = true;
2525     }
2526 
2527   rtx result_reg = gen_reg_rtx (word_mode);
2528   rtx final_move_label = gen_label_rtx ();
2529   rtx final_label = gen_label_rtx ();
2530   rtx begin_compare_label = NULL;
2531 
2532   if (base_align < required_align)
2533     {
2534       /* Generate code that checks distance to 4k boundary for this case.  */
2535       begin_compare_label = gen_label_rtx ();
2536       rtx strncmp_label = gen_label_rtx ();
2537       rtx jmp;
2538 
2539       /* Strncmp for power8 in glibc does this:
2540 	 rldicl r8,r3,0,52
2541 	 cmpldi cr7,r8,4096-16
2542 	 bgt    cr7,L(pagecross) */
2543 
2544       /* Make sure that the length we use for the alignment test and
2545          the subsequent code generation are in agreement so we do not
2546          go past the length we tested for a 4k boundary crossing.  */
2547       unsigned HOST_WIDE_INT align_test = compare_length;
2548       if (align_test < required_align)
2549         {
2550           align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
2551           base_align = align_test;
2552         }
2553       else
2554         {
2555           align_test = ROUND_UP (align_test, required_align);
2556           base_align = required_align;
2557         }
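      /* E.g. with required_align == 8: a compare_length of 5 gives
         align_test = 1 << ceil_log2 (5) == 8, while a compare_length
         of 10 is rounded up to align_test == 16 with base_align set
         to 8.  */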
2558 
2559       if (align1 < required_align)
2560         expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
2561       if (align2 < required_align)
2562         expand_strncmp_align_check (strncmp_label, src2_addr, align_test);
2563 
2564       /* Now generate the following sequence:
2565 	 - branch to begin_compare
2566 	 - strncmp_label
2567 	 - call to strncmp
2568 	 - branch to final_label
2569 	 - begin_compare_label */
2570 
2571       rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
2572       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
2573       JUMP_LABEL (jmp) = begin_compare_label;
2574       LABEL_NUSES (begin_compare_label) += 1;
2575       emit_barrier ();
2576 
2577       emit_label (strncmp_label);
2578 
2579       if (no_length)
2580 	{
2581 	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2582 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2583 				   target, LCT_NORMAL, GET_MODE (target),
2584 				   force_reg (Pmode, src1_addr), Pmode,
2585 				   force_reg (Pmode, src2_addr), Pmode);
2586 	}
2587       else
2588 	{
2589 	  /* -m32 -mpowerpc64 results in word_mode being DImode even
2590 	     though otherwise it is 32-bit.  The length arg to strncmp
2591 	     is a size_t which will be the same size as pointers.  */
2592 	  rtx len_rtx = gen_reg_rtx (Pmode);
2593 	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
2594 
2595 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2596 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2597 				   target, LCT_NORMAL, GET_MODE (target),
2598 				   force_reg (Pmode, src1_addr), Pmode,
2599 				   force_reg (Pmode, src2_addr), Pmode,
2600 				   len_rtx, Pmode);
2601 	}
2602 
2603       rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2604       jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2605       JUMP_LABEL (jmp) = final_label;
2606       LABEL_NUSES (final_label) += 1;
2607       emit_barrier ();
2608       emit_label (begin_compare_label);
2609     }
2610 
2611   rtx cleanup_label = NULL;
2612   rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;
2613 
2614   /* Generate a sequence of GPR or VEC/VSX instructions to compare out
2615      to the length specified.  */
2616   if (use_vec)
2617     {
2618       s1addr = gen_reg_rtx (Pmode);
2619       s2addr = gen_reg_rtx (Pmode);
2620       off_reg = gen_reg_rtx (Pmode);
2621       vec_result = gen_reg_rtx (load_mode);
2622       emit_move_insn (result_reg, GEN_INT (0));
2623       expand_cmp_vec_sequence (compare_length,
2624 			       orig_src1, orig_src2,
2625 			       s1addr, s2addr, off_reg,
2626 			       tmp_reg_src1, tmp_reg_src2,
2627 			       vec_result,
2628 			       equality_compare_rest,
2629 			       &cleanup_label, final_move_label, true);
2630     }
2631   else
2632     expand_strncmp_gpr_sequence (compare_length, base_align,
2633 				 orig_src1, orig_src2,
2634 				 tmp_reg_src1, tmp_reg_src2,
2635 				 result_reg,
2636 				 equality_compare_rest,
2637 				 &cleanup_label, final_move_label);
2638 
2639   offset = compare_length;
2640 
2641   if (equality_compare_rest)
2642     {
2643       /* Update pointers past what has been compared already.  */
2644       rtx src1 = force_reg (Pmode,
2645 			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
2646       rtx src2 = force_reg (Pmode,
2647 			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));
2648 
2649       /* Construct call to strcmp/strncmp to compare the rest of the string.  */
2650       if (no_length)
2651 	{
2652 	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2653 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2654 				   target, LCT_NORMAL, GET_MODE (target),
2655 				   src1, Pmode, src2, Pmode);
2656 	}
2657       else
2658 	{
2659 	  rtx len_rtx = gen_reg_rtx (Pmode);
2660 	  emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
2661 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2662 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2663 				   target, LCT_NORMAL, GET_MODE (target),
2664 				   src1, Pmode, src2, Pmode, len_rtx, Pmode);
2665 	}
2666 
2667       rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2668       rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2669       JUMP_LABEL (jmp) = final_label;
2670       LABEL_NUSES (final_label) += 1;
2671       emit_barrier ();
2672     }
2673 
2674   if (cleanup_label)
2675     emit_label (cleanup_label);
2676 
2677   if (use_vec)
2678     emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
2679 			    s1addr, s2addr, orig_src1, orig_src2,
2680 			    off_reg, vec_result);
2681   else
2682     emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
2683 
2684   emit_label (final_move_label);
2685   emit_insn (gen_movsi (target,
2686 			gen_lowpart (SImode, result_reg)));
2687   emit_label (final_label);
2688   return true;
2689 }
2690 
2691 /* Generate loads and stores for a move of v4si mode using lvx/stvx.
2692    This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
2693    keep combine from changing what instruction gets used.
2694 
2695    DEST is the destination for the data.
2696    SRC is the source of the data for the move.  */
2697 
2698 static rtx
2699 gen_lvx_v4si_move (rtx dest, rtx src)
2700 {
2701   gcc_assert (MEM_P (dest) ^ MEM_P (src));
2702   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
2703 
2704   if (MEM_P (dest))
2705     return gen_altivec_stvx_v4si_internal (dest, src);
2706   else
2707     return gen_altivec_lvx_v4si_internal (dest, src);
2708 }
2709 
2710 /* Expand a block move operation, and return 1 if successful.  Return 0
2711    if we should let the compiler generate normal code.
2712 
2713    operands[0] is the destination
2714    operands[1] is the source
2715    operands[2] is the length
2716    operands[3] is the alignment */
2717 
2718 #define MAX_MOVE_REG 4
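/* Loads are emitted as soon as they are generated, but the matching
   stores are queued in STORES and flushed in groups of up to
   MAX_MOVE_REG, so several loads can issue before the first store.  */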
2719 
2720 int
2721 expand_block_move (rtx operands[])
2722 {
2723   rtx orig_dest = operands[0];
2724   rtx orig_src	= operands[1];
2725   rtx bytes_rtx	= operands[2];
2726   rtx align_rtx = operands[3];
2727   int constp	= CONST_INT_P (bytes_rtx);
2728   int align;
2729   int bytes;
2730   int offset;
2731   int move_bytes;
2732   rtx stores[MAX_MOVE_REG];
2733   int num_reg = 0;
2734 
2735   /* If this is not a fixed size move, just call memcpy.  */
2736   if (! constp)
2737     return 0;
2738 
2739   /* This must be a fixed size alignment.  */
2740   gcc_assert (CONST_INT_P (align_rtx));
2741   align = INTVAL (align_rtx) * BITS_PER_UNIT;
2742 
2743   /* Anything to move? */
2744   bytes = INTVAL (bytes_rtx);
2745   if (bytes <= 0)
2746     return 1;
2747 
2748   if (bytes > rs6000_block_move_inline_limit)
2749     return 0;
2750 
2751   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2752     {
2753       union {
2754 	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
2755 	rtx (*mov) (rtx, rtx);
2756       } gen_func;
2757       machine_mode mode = BLKmode;
2758       rtx src, dest;
2759 
2760       /* Altivec first, since it will be faster than a string move
2761 	 when it applies, and usually not significantly larger.  */
2762       if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
2763 	{
2764 	  move_bytes = 16;
2765 	  mode = V4SImode;
2766 	  gen_func.mov = gen_lvx_v4si_move;
2767 	}
2768       else if (bytes >= 8 && TARGET_POWERPC64
2769 	       && (align >= 64 || !STRICT_ALIGNMENT))
2770 	{
2771 	  move_bytes = 8;
2772 	  mode = DImode;
2773 	  gen_func.mov = gen_movdi;
2774 	  if (offset == 0 && align < 64)
2775 	    {
2776 	      rtx addr;
2777 
2778 	      /* If the address form is reg+offset with offset not a
2779 		 multiple of four, reload into reg indirect form here
2780 		 rather than waiting for reload.  This way we get one
2781 		 reload, not one per load and/or store.  */
2782 	      addr = XEXP (orig_dest, 0);
2783 	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2784 		  && CONST_INT_P (XEXP (addr, 1))
2785 		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2786 		{
2787 		  addr = copy_addr_to_reg (addr);
2788 		  orig_dest = replace_equiv_address (orig_dest, addr);
2789 		}
2790 	      addr = XEXP (orig_src, 0);
2791 	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2792 		  && CONST_INT_P (XEXP (addr, 1))
2793 		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2794 		{
2795 		  addr = copy_addr_to_reg (addr);
2796 		  orig_src = replace_equiv_address (orig_src, addr);
2797 		}
2798 	    }
2799 	}
2800       else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2801 	{			/* move 4 bytes */
2802 	  move_bytes = 4;
2803 	  mode = SImode;
2804 	  gen_func.mov = gen_movsi;
2805 	}
2806       else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2807 	{			/* move 2 bytes */
2808 	  move_bytes = 2;
2809 	  mode = HImode;
2810 	  gen_func.mov = gen_movhi;
2811 	}
2812       else /* move 1 byte at a time */
2813 	{
2814 	  move_bytes = 1;
2815 	  mode = QImode;
2816 	  gen_func.mov = gen_movqi;
2817 	}
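      /* For instance, an 18-byte copy with 128-bit aligned operands
	 on an Altivec target becomes one 16-byte V4SImode move
	 followed by a 2-byte HImode move.  */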
2818 
2819       src = adjust_address (orig_src, mode, offset);
2820       dest = adjust_address (orig_dest, mode, offset);
2821 
2822       if (mode != BLKmode)
2823 	{
2824 	  rtx tmp_reg = gen_reg_rtx (mode);
2825 
2826 	  emit_insn ((*gen_func.mov) (tmp_reg, src));
2827 	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
2828 	}
2829 
2830       if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2831 	{
2832 	  int i;
2833 	  for (i = 0; i < num_reg; i++)
2834 	    emit_insn (stores[i]);
2835 	  num_reg = 0;
2836 	}
2837 
2838       if (mode == BLKmode)
2839 	{
2840 	  /* Move the address into scratch registers.  The movmemsi
2841 	     patterns require zero offset.  */
2842 	  if (!REG_P (XEXP (src, 0)))
2843 	    {
2844 	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
2845 	      src = replace_equiv_address (src, src_reg);
2846 	    }
2847 	  set_mem_size (src, move_bytes);
2848 
2849 	  if (!REG_P (XEXP (dest, 0)))
2850 	    {
2851 	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
2852 	      dest = replace_equiv_address (dest, dest_reg);
2853 	    }
2854 	  set_mem_size (dest, move_bytes);
2855 
2856 	  emit_insn ((*gen_func.movmemsi) (dest, src,
2857 					   GEN_INT (move_bytes & 31),
2858 					   align_rtx));
2859 	}
2860     }
2861 
2862   return 1;
2863 }
2864