xref: /netbsd-src/external/gpl3/gcc/dist/gcc/input.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* Data and functions related to line maps and input files.
2    Copyright (C) 2004-2022 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 3, or (at your option) any later
9 version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "intl.h"
24 #include "diagnostic.h"
25 #include "selftest.h"
26 #include "cpplib.h"
27 
28 #ifndef HAVE_ICONV
29 #define HAVE_ICONV 0
30 #endif
31 
32 /* Input charset configuration.  */
/* Fallback charset callback used when the client installs none.  The
   file name argument is ignored and the answer is always "no charset".  */
static const char *
default_charset_callback (const char * /* file_path */)
{
  return nullptr;
}
37 
38 void
initialize_input_context(diagnostic_input_charset_callback ccb,bool should_skip_bom)39 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
40 				      bool should_skip_bom)
41 {
42   in_context.ccb = (ccb ? ccb : default_charset_callback);
43   in_context.should_skip_bom = should_skip_bom;
44 }
45 
46 /* This is a cache used by get_next_line to store the content of a
47    file to be searched for file lines.  */
48 class file_cache_slot
49 {
50 public:
51   file_cache_slot ();
52   ~file_cache_slot ();
53 
54   bool read_line_num (size_t line_num,
55 		      char ** line, ssize_t *line_len);
56 
57   /* Accessors.  */
get_file_path() const58   const char *get_file_path () const { return m_file_path; }
get_use_count() const59   unsigned get_use_count () const { return m_use_count; }
missing_trailing_newline_p() const60   bool missing_trailing_newline_p () const
61   {
62     return m_missing_trailing_newline;
63   }
64 
inc_use_count()65   void inc_use_count () { m_use_count++; }
66 
67   bool create (const file_cache::input_context &in_context,
68 	       const char *file_path, FILE *fp, unsigned highest_use_count);
69   void evict ();
70 
71  private:
72   /* These are information used to store a line boundary.  */
73   class line_info
74   {
75   public:
76     /* The line number.  It starts from 1.  */
77     size_t line_num;
78 
79     /* The position (byte count) of the beginning of the line,
80        relative to the file data pointer.  This starts at zero.  */
81     size_t start_pos;
82 
83     /* The position (byte count) of the last byte of the line.  This
84        normally points to the '\n' character, or to one byte after the
85        last byte of the file, if the file doesn't contain a '\n'
86        character.  */
87     size_t end_pos;
88 
line_info(size_t l,size_t s,size_t e)89     line_info (size_t l, size_t s, size_t e)
90       : line_num (l), start_pos (s), end_pos (e)
91     {}
92 
line_info()93     line_info ()
94       :line_num (0), start_pos (0), end_pos (0)
95     {}
96   };
97 
98   bool needs_read_p () const;
99   bool needs_grow_p () const;
100   void maybe_grow ();
101   bool read_data ();
102   bool maybe_read_data ();
103   bool get_next_line (char **line, ssize_t *line_len);
104   bool read_next_line (char ** line, ssize_t *line_len);
105   bool goto_next_line ();
106 
107   static const size_t buffer_size = 4 * 1024;
108   static const size_t line_record_size = 100;
109 
110   /* The number of time this file has been accessed.  This is used
111      to designate which file cache to evict from the cache
112      array.  */
113   unsigned m_use_count;
114 
115   /* The file_path is the key for identifying a particular file in
116      the cache.
117      For libcpp-using code, the underlying buffer for this field is
118      owned by the corresponding _cpp_file within the cpp_reader.  */
119   const char *m_file_path;
120 
121   FILE *m_fp;
122 
123   /* This points to the content of the file that we've read so
124      far.  */
125   char *m_data;
126 
127   /* The allocated buffer to be freed may start a little earlier than DATA,
128      e.g. if a UTF8 BOM was skipped at the beginning.  */
129   int m_alloc_offset;
130 
131   /*  The size of the DATA array above.*/
132   size_t m_size;
133 
134   /* The number of bytes read from the underlying file so far.  This
135      must be less (or equal) than SIZE above.  */
136   size_t m_nb_read;
137 
138   /* The index of the beginning of the current line.  */
139   size_t m_line_start_idx;
140 
141   /* The number of the previous line read.  This starts at 1.  Zero
142      means we've read no line so far.  */
143   size_t m_line_num;
144 
145   /* This is the total number of lines of the current file.  At the
146      moment, we try to get this information from the line map
147      subsystem.  Note that this is just a hint.  When using the C++
148      front-end, this hint is correct because the input file is then
149      completely tokenized before parsing starts; so the line map knows
150      the number of lines before compilation really starts.  For e.g,
151      the C front-end, it can happen that we start emitting diagnostics
152      before the line map has seen the end of the file.  */
153   size_t m_total_lines;
154 
155   /* Could this file be missing a trailing newline on its final line?
156      Initially true (to cope with empty files), set to true/false
157      as each line is read.  */
158   bool m_missing_trailing_newline;
159 
160   /* This is a record of the beginning and end of the lines we've seen
161      while reading the file.  This is useful to avoid walking the data
162      from the beginning when we are asked to read a line that is
163      before LINE_START_IDX above.  Note that the maximum size of this
164      record is line_record_size, so that the memory consumption
165      doesn't explode.  We thus scale total_lines down to
166      line_record_size.  */
167   vec<line_info, va_heap> m_line_record;
168 
offset_buffer(int offset)169   void offset_buffer (int offset)
170   {
171     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
172 		: (size_t) offset <= m_size);
173     gcc_assert (m_data);
174     m_alloc_offset += offset;
175     m_data += offset;
176     m_size -= offset;
177   }
178 
179 };
180 
181 /* The current position in the real source file.  */
182 
183 location_t input_location = UNKNOWN_LOCATION;
184 
185 class line_maps *line_table;  /* The line-map table handed to the linemap_* calls in this file.  */
186 
187 /* A stashed copy of "line_table" for use by selftest::line_table_test.
188    This has to be a global so that it can be a GC root, which
189    prevents the stashed copy from being garbage-collected if the GC runs
190    during a line_table_test.  */
191 
192 class line_maps *saved_line_table;
193 
194 /* Expand the source location LOC into a human readable location.  If
195    LOC resolves to a builtin location, the file name of the readable
196    location is set to the string "<built-in>". If EXPANSION_POINT_P is
197    TRUE and LOC is virtual, then it is resolved to the expansion
198    point of the involved macro.  Otherwise, it is resolved to the
199    spelling location of the token.
200 
201    When resolving to the spelling location of the token, if the
202    resulting location is for a built-in location (that is, it has no
203    associated line/column) in the context of a macro expansion, the
204    returned location is the first one (while unwinding the macro
205    location towards its expansion point) that is in real source
206    code.
207 
208    ASPECT controls which part of the location to use.  */
209 
210 static expanded_location
expand_location_1(location_t loc,bool expansion_point_p,enum location_aspect aspect)211 expand_location_1 (location_t loc,
212 		   bool expansion_point_p,
213 		   enum location_aspect aspect)
214 {
215   expanded_location xloc;
216   const line_map_ordinary *map;
217   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
218   tree block = NULL;
219 
220   if (IS_ADHOC_LOC (loc))
221     {
222       block = LOCATION_BLOCK (loc);
223       loc = LOCATION_LOCUS (loc);
224     }
225 
226   memset (&xloc, 0, sizeof (xloc));
227 
228   if (loc >= RESERVED_LOCATION_COUNT)
229     {
230       if (!expansion_point_p)
231 	{
232 	  /* We want to resolve LOC to its spelling location.
233 
234 	     But if that spelling location is a reserved location that
235 	     appears in the context of a macro expansion (like for a
236 	     location for a built-in token), let's consider the first
237 	     location (toward the expansion point) that is not reserved;
238 	     that is, the first location that is in real source code.  */
239 	  loc = linemap_unwind_to_first_non_reserved_loc (line_table,
240 							  loc, NULL);
241 	  lrk = LRK_SPELLING_LOCATION;
242 	}
243       loc = linemap_resolve_location (line_table, loc, lrk, &map);
244 
245       /* loc is now either in an ordinary map, or is a reserved location.
246 	 If it is a compound location, the caret is in a spelling location,
247 	 but the start/finish might still be a virtual location.
248 	 Depending of what the caller asked for, we may need to recurse
249 	 one level in order to resolve any virtual locations in the
250 	 end-points.  */
251       switch (aspect)
252 	{
253 	default:
254 	  gcc_unreachable ();
255 	  /* Fall through.  */
256 	case LOCATION_ASPECT_CARET:
257 	  break;
258 	case LOCATION_ASPECT_START:
259 	  {
260 	    location_t start = get_start (loc);
261 	    if (start != loc)
262 	      return expand_location_1 (start, expansion_point_p, aspect);
263 	  }
264 	  break;
265 	case LOCATION_ASPECT_FINISH:
266 	  {
267 	    location_t finish = get_finish (loc);
268 	    if (finish != loc)
269 	      return expand_location_1 (finish, expansion_point_p, aspect);
270 	  }
271 	  break;
272 	}
273       xloc = linemap_expand_location (line_table, map, loc);
274     }
275 
276   xloc.data = block;
277   if (loc <= BUILTINS_LOCATION)
278     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
279 
280   return xloc;
281 }
282 
283 /* Initialize the set of cache used for files accessed by caret
284    diagnostic.  */
285 
286 static void
diagnostic_file_cache_init(void)287 diagnostic_file_cache_init (void)
288 {
289   gcc_assert (global_dc);
290   if (global_dc->m_file_cache == NULL)
291     global_dc->m_file_cache = new file_cache ();
292 }
293 
294 /* Free the resources used by the set of cache used for files accessed
295    by caret diagnostic.  */
296 
297 void
diagnostic_file_cache_fini(void)298 diagnostic_file_cache_fini (void)
299 {
300   if (global_dc->m_file_cache)
301     {
302       delete global_dc->m_file_cache;
303       global_dc->m_file_cache = NULL;
304     }
305 }
306 
307 /* Return the total lines number that have been read so far by the
308    line map (in the preprocessor) so far.  For languages like C++ that
309    entirely preprocess the input file before starting to parse, this
310    equals the actual number of lines of the file.  */
311 
312 static size_t
total_lines_num(const char * file_path)313 total_lines_num (const char *file_path)
314 {
315   size_t r = 0;
316   location_t l = 0;
317   if (linemap_get_file_highest_location (line_table, file_path, &l))
318     {
319       gcc_assert (l >= RESERVED_LOCATION_COUNT);
320       expanded_location xloc = expand_location (l);
321       r = xloc.line;
322     }
323   return r;
324 }
325 
326 /* Lookup the cache used for the content of a given file accessed by
327    caret diagnostic.  Return the found cached file, or NULL if no
328    cached file was found.  */
329 
330 file_cache_slot *
lookup_file(const char * file_path)331 file_cache::lookup_file (const char *file_path)
332 {
333   gcc_assert (file_path);
334 
335   /* This will contain the found cached file.  */
336   file_cache_slot *r = NULL;
337   for (unsigned i = 0; i < num_file_slots; ++i)
338     {
339       file_cache_slot *c = &m_file_slots[i];
340       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
341 	{
342 	  c->inc_use_count ();
343 	  r = c;
344 	}
345     }
346 
347   if (r)
348     r->inc_use_count ();
349 
350   return r;
351 }
352 
353 /* Purge any mention of FILENAME from the cache of files used for
354    printing source code.  For use in selftests when working
355    with tempfiles.  */
356 
357 void
diagnostics_file_cache_forcibly_evict_file(const char * file_path)358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
359 {
360   gcc_assert (file_path);
361 
362   if (!global_dc->m_file_cache)
363     return;
364 
365   global_dc->m_file_cache->forcibly_evict_file (file_path);
366 }
367 
368 void
forcibly_evict_file(const char * file_path)369 file_cache::forcibly_evict_file (const char *file_path)
370 {
371   gcc_assert (file_path);
372 
373   file_cache_slot *r = lookup_file (file_path);
374   if (!r)
375     /* Not found.  */
376     return;
377 
378   r->evict ();
379 }
380 
381 void
evict()382 file_cache_slot::evict ()
383 {
384   m_file_path = NULL;
385   if (m_fp)
386     fclose (m_fp);
387   m_fp = NULL;
388   m_nb_read = 0;
389   m_line_start_idx = 0;
390   m_line_num = 0;
391   m_line_record.truncate (0);
392   m_use_count = 0;
393   m_total_lines = 0;
394   m_missing_trailing_newline = true;
395 }
396 
397 /* Return the file cache that has been less used, recently, or the
398    first empty one.  If HIGHEST_USE_COUNT is non-null,
399    *HIGHEST_USE_COUNT is set to the highest use count of the entries
400    in the cache table.  */
401 
402 file_cache_slot*
evicted_cache_tab_entry(unsigned * highest_use_count)403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
404 {
405   diagnostic_file_cache_init ();
406 
407   file_cache_slot *to_evict = &m_file_slots[0];
408   unsigned huc = to_evict->get_use_count ();
409   for (unsigned i = 1; i < num_file_slots; ++i)
410     {
411       file_cache_slot *c = &m_file_slots[i];
412       bool c_is_empty = (c->get_file_path () == NULL);
413 
414       if (c->get_use_count () < to_evict->get_use_count ()
415 	  || (to_evict->get_file_path () && c_is_empty))
416 	/* We evict C because it's either an entry with a lower use
417 	   count or one that is empty.  */
418 	to_evict = c;
419 
420       if (huc < c->get_use_count ())
421 	huc = c->get_use_count ();
422 
423       if (c_is_empty)
424 	/* We've reached the end of the cache; subsequent elements are
425 	   all empty.  */
426 	break;
427     }
428 
429   if (highest_use_count)
430     *highest_use_count = huc;
431 
432   return to_evict;
433 }
434 
435 /* Create the cache used for the content of a given file to be
436    accessed by caret diagnostic.  This cache is added to an array of
437    cache and can be retrieved by lookup_file_in_cache_tab.  This
438    function returns the created cache.  Note that only the last
439    num_file_slots files are cached.  */
440 
441 file_cache_slot*
add_file(const char * file_path)442 file_cache::add_file (const char *file_path)
443 {
444 
445   FILE *fp = fopen (file_path, "r");
446   if (fp == NULL)
447     return NULL;
448 
449   unsigned highest_use_count = 0;
450   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
451   if (!r->create (in_context, file_path, fp, highest_use_count))
452     return NULL;
453   return r;
454 }
455 
456 /* Populate this slot for use on FILE_PATH and FP, dropping any
457    existing cached content within it.  */
458 
459 bool
create(const file_cache::input_context & in_context,const char * file_path,FILE * fp,unsigned highest_use_count)460 file_cache_slot::create (const file_cache::input_context &in_context,
461 			 const char *file_path, FILE *fp,
462 			 unsigned highest_use_count)
463 {
464   m_file_path = file_path;
465   if (m_fp)
466     fclose (m_fp);
467   m_fp = fp;
468   if (m_alloc_offset)
469     offset_buffer (-m_alloc_offset);
470   m_nb_read = 0;
471   m_line_start_idx = 0;
472   m_line_num = 0;
473   m_line_record.truncate (0);
474   /* Ensure that this cache entry doesn't get evicted next time
475      add_file_to_cache_tab is called.  */
476   m_use_count = ++highest_use_count;
477   m_total_lines = total_lines_num (file_path);
478   m_missing_trailing_newline = true;
479 
480 
481   /* Check the input configuration to determine if we need to do any
482      transformations, such as charset conversion or BOM skipping.  */
483   if (const char *input_charset = in_context.ccb (file_path))
484     {
485       /* Need a full-blown conversion of the input charset.  */
486       fclose (m_fp);
487       m_fp = NULL;
488       const cpp_converted_source cs
489 	= cpp_get_converted_source (file_path, input_charset);
490       if (!cs.data)
491 	return false;
492       if (m_data)
493 	XDELETEVEC (m_data);
494       m_data = cs.data;
495       m_nb_read = m_size = cs.len;
496       m_alloc_offset = cs.data - cs.to_free;
497     }
498   else if (in_context.should_skip_bom)
499     {
500       if (read_data ())
501 	{
502 	  const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
503 	  offset_buffer (offset);
504 	  m_nb_read -= offset;
505 	}
506     }
507 
508   return true;
509 }
510 
511 /* file_cache's ctor.  */
512 
file_cache()513 file_cache::file_cache ()
514 : m_file_slots (new file_cache_slot[num_file_slots])
515 {
516   initialize_input_context (nullptr, false);
517 }
518 
519 /* file_cache's dtor.  */
520 
~file_cache()521 file_cache::~file_cache ()
522 {
523   delete[] m_file_slots;
524 }
525 
526 /* Lookup the cache used for the content of a given file accessed by
527    caret diagnostic.  If no cached file was found, create a new cache
528    for this file, add it to the array of cached file and return
529    it.  */
530 
531 file_cache_slot*
lookup_or_add_file(const char * file_path)532 file_cache::lookup_or_add_file (const char *file_path)
533 {
534   file_cache_slot *r = lookup_file (file_path);
535   if (r == NULL)
536     r = add_file (file_path);
537   return r;
538 }
539 
540 /* Default constructor for a cache of file used by caret
541    diagnostic.  */
542 
file_cache_slot()543 file_cache_slot::file_cache_slot ()
544 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
545   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
546   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
547 {
548   m_line_record.create (0);
549 }
550 
551 /* Destructor for a cache of file used by caret diagnostic.  */
552 
~file_cache_slot()553 file_cache_slot::~file_cache_slot ()
554 {
555   if (m_fp)
556     {
557       fclose (m_fp);
558       m_fp = NULL;
559     }
560   if (m_data)
561     {
562       offset_buffer (-m_alloc_offset);
563       XDELETEVEC (m_data);
564       m_data = 0;
565     }
566   m_line_record.release ();
567 }
568 
569 /* Returns TRUE iff the cache would need to be filled with data coming
570    from the file.  That is, either the cache is empty or full or the
571    current line is empty.  Note that if the cache is full, it would
572    need to be extended and filled again.  */
573 
574 bool
needs_read_p() const575 file_cache_slot::needs_read_p () const
576 {
577   return m_fp && (m_nb_read == 0
578 	  || m_nb_read == m_size
579 	  || (m_line_start_idx >= m_nb_read - 1));
580 }
581 
582 /*  Return TRUE iff the cache is full and thus needs to be
583     extended.  */
584 
585 bool
needs_grow_p() const586 file_cache_slot::needs_grow_p () const
587 {
588   return m_nb_read == m_size;
589 }
590 
591 /* Grow the cache if it needs to be extended.  */
592 
593 void
maybe_grow()594 file_cache_slot::maybe_grow ()
595 {
596   if (!needs_grow_p ())
597     return;
598 
599   if (!m_data)
600     {
601       gcc_assert (m_size == 0 && m_alloc_offset == 0);
602       m_size = buffer_size;
603       m_data = XNEWVEC (char, m_size);
604     }
605   else
606     {
607       const int offset = m_alloc_offset;
608       offset_buffer (-offset);
609       m_size *= 2;
610       m_data = XRESIZEVEC (char, m_data, m_size);
611       offset_buffer (offset);
612     }
613 }
614 
615 /*  Read more data into the cache.  Extends the cache if need be.
616     Returns TRUE iff new data could be read.  */
617 
618 bool
read_data()619 file_cache_slot::read_data ()
620 {
621   if (feof (m_fp) || ferror (m_fp))
622     return false;
623 
624   maybe_grow ();
625 
626   char * from = m_data + m_nb_read;
627   size_t to_read = m_size - m_nb_read;
628   size_t nb_read = fread (from, 1, to_read, m_fp);
629 
630   if (ferror (m_fp))
631     return false;
632 
633   m_nb_read += nb_read;
634   return !!nb_read;
635 }
636 
637 /* Read new data iff the cache needs to be filled with more data
638    coming from the file FP.  Return TRUE iff the cache was filled with
639    mode data.  */
640 
641 bool
maybe_read_data()642 file_cache_slot::maybe_read_data ()
643 {
644   if (!needs_read_p ())
645     return false;
646   return read_data ();
647 }
648 
649 /* Read a new line from file FP, using C as a cache for the data
650    coming from the file.  Upon successful completion, *LINE is set to
651    the beginning of the line found.  *LINE points directly in the
652    line cache and is only valid until the next call of get_next_line.
653    *LINE_LEN is set to the length of the line.  Note that the line
654    does not contain any terminal delimiter.  This function returns
655    true if some data was read or process from the cache, false
656    otherwise.  Note that subsequent calls to get_next_line might
657    make the content of *LINE invalid.  */
658 
659 bool
get_next_line(char ** line,ssize_t * line_len)660 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
661 {
662   /* Fill the cache with data to process.  */
663   maybe_read_data ();
664 
665   size_t remaining_size = m_nb_read - m_line_start_idx;
666   if (remaining_size == 0)
667     /* There is no more data to process.  */
668     return false;
669 
670   char *line_start = m_data + m_line_start_idx;
671 
672   char *next_line_start = NULL;
673   size_t len = 0;
674   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
675   if (line_end == NULL)
676     {
677       /* We haven't found the end-of-line delimiter in the cache.
678 	 Fill the cache with more data from the file and look for the
679 	 '\n'.  */
680       while (maybe_read_data ())
681 	{
682 	  line_start = m_data + m_line_start_idx;
683 	  remaining_size = m_nb_read - m_line_start_idx;
684 	  line_end = (char *) memchr (line_start, '\n', remaining_size);
685 	  if (line_end != NULL)
686 	    {
687 	      next_line_start = line_end + 1;
688 	      break;
689 	    }
690 	}
691       if (line_end == NULL)
692 	{
693 	  /* We've loadded all the file into the cache and still no
694 	     '\n'.  Let's say the line ends up at one byte passed the
695 	     end of the file.  This is to stay consistent with the case
696 	     of when the line ends up with a '\n' and line_end points to
697 	     that terminal '\n'.  That consistency is useful below in
698 	     the len calculation.  */
699 	  line_end = m_data + m_nb_read ;
700 	  m_missing_trailing_newline = true;
701 	}
702       else
703 	m_missing_trailing_newline = false;
704     }
705   else
706     {
707       next_line_start = line_end + 1;
708       m_missing_trailing_newline = false;
709     }
710 
711   if (m_fp && ferror (m_fp))
712     return false;
713 
714   /* At this point, we've found the end of the of line.  It either
715      points to the '\n' or to one byte after the last byte of the
716      file.  */
717   gcc_assert (line_end != NULL);
718 
719   len = line_end - line_start;
720 
721   if (m_line_start_idx < m_nb_read)
722     *line = line_start;
723 
724   ++m_line_num;
725 
726   /* Before we update our line record, make sure the hint about the
727      total number of lines of the file is correct.  If it's not, then
728      we give up recording line boundaries from now on.  */
729   bool update_line_record = true;
730   if (m_line_num > m_total_lines)
731     update_line_record = false;
732 
733     /* Now update our line record so that re-reading lines from the
734      before m_line_start_idx is faster.  */
735   if (update_line_record
736       && m_line_record.length () < line_record_size)
737     {
738       /* If the file lines fits in the line record, we just record all
739 	 its lines ...*/
740       if (m_total_lines <= line_record_size
741 	  && m_line_num > m_line_record.length ())
742 	m_line_record.safe_push
743 	  (file_cache_slot::line_info (m_line_num,
744 				       m_line_start_idx,
745 				       line_end - m_data));
746       else if (m_total_lines > line_record_size)
747 	{
748 	  /* ... otherwise, we just scale total_lines down to
749 	     (line_record_size lines.  */
750 	  size_t n = (m_line_num * line_record_size) / m_total_lines;
751 	  if (m_line_record.length () == 0
752 	      || n >= m_line_record.length ())
753 	    m_line_record.safe_push
754 	      (file_cache_slot::line_info (m_line_num,
755 					   m_line_start_idx,
756 					   line_end - m_data));
757 	}
758     }
759 
760   /* Update m_line_start_idx so that it points to the next line to be
761      read.  */
762   if (next_line_start)
763     m_line_start_idx = next_line_start - m_data;
764   else
765     /* We didn't find any terminal '\n'.  Let's consider that the end
766        of line is the end of the data in the cache.  The next
767        invocation of get_next_line will either read more data from the
768        underlying file or return false early because we've reached the
769        end of the file.  */
770     m_line_start_idx = m_nb_read;
771 
772   *line_len = len;
773 
774   return true;
775 }
776 
777 /* Consume the next bytes coming from the cache (or from its
778    underlying file if there are remaining unread bytes in the file)
779    until we reach the next end-of-line (or end-of-file).  There is no
780    copying from the cache involved.  Return TRUE upon successful
781    completion.  */
782 
783 bool
goto_next_line()784 file_cache_slot::goto_next_line ()
785 {
786   char *l;
787   ssize_t len;
788 
789   return get_next_line (&l, &len);
790 }
791 
792 /* Read an arbitrary line number LINE_NUM from the file cached in C.
793    If the line was read successfully, *LINE points to the beginning
794    of the line in the file cache and *LINE_LEN is the length of the
795    line.  *LINE is not nul-terminated, but may contain zero bytes.
796    *LINE is only valid until the next call of read_line_num.
797    This function returns bool if a line was read.  */
798 
799 bool
read_line_num(size_t line_num,char ** line,ssize_t * line_len)800 file_cache_slot::read_line_num (size_t line_num,
801 		       char ** line, ssize_t *line_len)
802 {
803   gcc_assert (line_num > 0);
804 
805   if (line_num <= m_line_num)
806     {
807       /* We've been asked to read lines that are before m_line_num.
808 	 So lets use our line record (if it's not empty) to try to
809 	 avoid re-reading the file from the beginning again.  */
810 
811       if (m_line_record.is_empty ())
812 	{
813 	  m_line_start_idx = 0;
814 	  m_line_num = 0;
815 	}
816       else
817 	{
818 	  file_cache_slot::line_info *i = NULL;
819 	  if (m_total_lines <= line_record_size)
820 	    {
821 	      /* In languages where the input file is not totally
822 		 preprocessed up front, the m_total_lines hint
823 		 can be smaller than the number of lines of the
824 		 file.  In that case, only the first
825 		 m_total_lines have been recorded.
826 
827 		 Otherwise, the first m_total_lines we've read have
828 		 their start/end recorded here.  */
829 	      i = (line_num <= m_total_lines)
830 		? &m_line_record[line_num - 1]
831 		: &m_line_record[m_total_lines - 1];
832 	      gcc_assert (i->line_num <= line_num);
833 	    }
834 	  else
835 	    {
836 	      /*  So the file had more lines than our line record
837 		  size.  Thus the number of lines we've recorded has
838 		  been scaled down to line_record_size.  Let's
839 		  pick the start/end of the recorded line that is
840 		  closest to line_num.  */
841 	      size_t n = (line_num <= m_total_lines)
842 		? line_num * line_record_size / m_total_lines
843 		: m_line_record.length () - 1;
844 	      if (n < m_line_record.length ())
845 		{
846 		  i = &m_line_record[n];
847 		  gcc_assert (i->line_num <= line_num);
848 		}
849 	    }
850 
851 	  if (i && i->line_num == line_num)
852 	    {
853 	      /* We have the start/end of the line.  */
854 	      *line = m_data + i->start_pos;
855 	      *line_len = i->end_pos - i->start_pos;
856 	      return true;
857 	    }
858 
859 	  if (i)
860 	    {
861 	      m_line_start_idx = i->start_pos;
862 	      m_line_num = i->line_num - 1;
863 	    }
864 	  else
865 	    {
866 	      m_line_start_idx = 0;
867 	      m_line_num = 0;
868 	    }
869 	}
870     }
871 
872   /*  Let's walk from line m_line_num up to line_num - 1, without
873       copying any line.  */
874   while (m_line_num < line_num - 1)
875     if (!goto_next_line ())
876       return false;
877 
878   /* The line we want is the next one.  Let's read and copy it back to
879      the caller.  */
880   return get_next_line (line, line_len);
881 }
882 
883 /* Return the physical source line that corresponds to FILE_PATH/LINE.
884    The line is not nul-terminated.  The returned pointer is only
885    valid until the next call of location_get_source_line.
886    Note that the line can contain several null characters,
887    so the returned value's length has the actual length of the line.
888    If the function fails, a NULL char_span is returned.  */
889 
890 char_span
location_get_source_line(const char * file_path,int line)891 location_get_source_line (const char *file_path, int line)
892 {
893   char *buffer = NULL;
894   ssize_t len;
895 
896   if (line == 0)
897     return char_span (NULL, 0);
898 
899   if (file_path == NULL)
900     return char_span (NULL, 0);
901 
902   diagnostic_file_cache_init ();
903 
904   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
905   if (c == NULL)
906     return char_span (NULL, 0);
907 
908   bool read = c->read_line_num (line, &buffer, &len);
909   if (!read)
910     return char_span (NULL, 0);
911 
912   return char_span (buffer, len);
913 }
914 
915 /* Determine if FILE_PATH missing a trailing newline on its final line.
916    Only valid to call once all of the file has been loaded, by
917    requesting a line number beyond the end of the file.  */
918 
919 bool
location_missing_trailing_newline(const char * file_path)920 location_missing_trailing_newline (const char *file_path)
921 {
922   diagnostic_file_cache_init ();
923 
924   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
925   if (c == NULL)
926     return false;
927 
928   return c->missing_trailing_newline_p ();
929 }
930 
931 /* Test if the location originates from the spelling location of a
932    builtin-tokens.  That is, return TRUE if LOC is a (possibly
933    virtual) location of a built-in token that appears in the expansion
934    list of a macro.  Please note that this function also works on
935    tokens that result from built-in tokens.  For instance, the
936    function would return true if passed a token "4" that is the result
937    of the expansion of the built-in __LINE__ macro.  */
938 bool
is_location_from_builtin_token(location_t loc)939 is_location_from_builtin_token (location_t loc)
940 {
941   const line_map_ordinary *map = NULL;
942   loc = linemap_resolve_location (line_table, loc,
943 				  LRK_SPELLING_LOCATION, &map);
944   return loc == BUILTINS_LOCATION;
945 }
946 
947 /* Expand the source location LOC into a human readable location.  If
948    LOC is virtual, it resolves to the expansion point of the involved
949    macro.  If LOC resolves to a builtin location, the file name of the
950    readable location is set to the string "<built-in>".  */
951 
952 expanded_location
expand_location(location_t loc)953 expand_location (location_t loc)
954 {
955   return expand_location_1 (loc, /*expansion_point_p=*/true,
956 			    LOCATION_ASPECT_CARET);
957 }
958 
959 /* Expand the source location LOC into a human readable location.  If
960    LOC is virtual, it resolves to the expansion location of the
961    relevant macro.  If LOC resolves to a builtin location, the file
962    name of the readable location is set to the string
963    "<built-in>".  */
964 
965 expanded_location
expand_location_to_spelling_point(location_t loc,enum location_aspect aspect)966 expand_location_to_spelling_point (location_t loc,
967 				   enum location_aspect aspect)
968 {
969   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
970 }
971 
972 /* The rich_location class within libcpp requires a way to expand
973    location_t instances, and relies on the client code
974    providing a symbol named
975      linemap_client_expand_location_to_spelling_point
976    to do this.
977 
978    This is the implementation for libcommon.a (all host binaries),
979    which simply calls into expand_location_1.  */
980 
981 expanded_location
linemap_client_expand_location_to_spelling_point(location_t loc,enum location_aspect aspect)982 linemap_client_expand_location_to_spelling_point (location_t loc,
983 						  enum location_aspect aspect)
984 {
985   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
986 }
987 
988 
989 /* If LOCATION is in a system header and if it is a virtual location
990    for a token coming from the expansion of a macro, unwind it to
991    the location of the expansion point of the macro.  If the expansion
992    point is also in a system header return the original LOCATION.
993    Otherwise, return the location of the expansion point.
994 
995    This is used for instance when we want to emit diagnostics about a
996    token that may be located in a macro that is itself defined in a
997    system header, for example, for the NULL macro.  In such a case, if
998    LOCATION were passed directly to diagnostic functions such as
999    warning_at, the diagnostic would be suppressed (unless
1000    -Wsystem-headers).  */
1001 
1002 location_t
expansion_point_location_if_in_system_header(location_t location)1003 expansion_point_location_if_in_system_header (location_t location)
1004 {
1005   if (!in_system_header_at (location))
1006     return location;
1007 
1008   location_t xloc = linemap_resolve_location (line_table, location,
1009 					      LRK_MACRO_EXPANSION_POINT,
1010 					      NULL);
1011   return in_system_header_at (xloc) ? location : xloc;
1012 }
1013 
1014 /* If LOCATION is a virtual location for a token coming from the expansion
1015    of a macro, unwind to the location of the expansion point of the macro.  */
1016 
1017 location_t
expansion_point_location(location_t location)1018 expansion_point_location (location_t location)
1019 {
1020   return linemap_resolve_location (line_table, location,
1021 				   LRK_MACRO_EXPANSION_POINT, NULL);
1022 }
1023 
1024 /* Construct a location with caret at CARET, ranging from START to
1025    finish e.g.
1026 
1027                  11111111112
1028         12345678901234567890
1029      522
1030      523   return foo + bar;
1031                   ~~~~^~~~~
1032      524
1033 
1034    The location's caret is at the "+", line 523 column 15, but starts
1035    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1036    of "bar" at column 19.  */
1037 
1038 location_t
make_location(location_t caret,location_t start,location_t finish)1039 make_location (location_t caret, location_t start, location_t finish)
1040 {
1041   location_t pure_loc = get_pure_location (caret);
1042   source_range src_range;
1043   src_range.m_start = get_start (start);
1044   src_range.m_finish = get_finish (finish);
1045   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1046 						   pure_loc,
1047 						   src_range,
1048 						   NULL);
1049   return combined_loc;
1050 }
1051 
1052 /* Same as above, but taking a source range rather than two locations.  */
1053 
1054 location_t
make_location(location_t caret,source_range src_range)1055 make_location (location_t caret, source_range src_range)
1056 {
1057   location_t pure_loc = get_pure_location (caret);
1058   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
1059 }
1060 
1061 /* An expanded_location stores the column in byte units.  This function
1062    converts that column to display units.  That requires reading the associated
1063    source line in order to calculate the display width.  If that cannot be done
1064    for any reason, then returns the byte column as a fallback.  */
1065 int
location_compute_display_column(expanded_location exploc,const cpp_char_column_policy & policy)1066 location_compute_display_column (expanded_location exploc,
1067 				 const cpp_char_column_policy &policy)
1068 {
1069   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1070     return exploc.column;
1071   char_span line = location_get_source_line (exploc.file, exploc.line);
1072   /* If line is NULL, this function returns exploc.column which is the
1073      desired fallback.  */
1074   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1075 					    exploc.column, policy);
1076 }
1077 
1078 /* Dump statistics to stderr about the memory usage of the line_table
1079    set of line maps.  This also displays some statistics about macro
1080    expansion.  */
1081 
1082 void
dump_line_table_statistics(void)1083 dump_line_table_statistics (void)
1084 {
1085   struct linemap_stats s;
1086   long total_used_map_size,
1087     macro_maps_size,
1088     total_allocated_map_size;
1089 
1090   memset (&s, 0, sizeof (s));
1091 
1092   linemap_get_statistics (line_table, &s);
1093 
1094   macro_maps_size = s.macro_maps_used_size
1095     + s.macro_maps_locations_size;
1096 
1097   total_allocated_map_size = s.ordinary_maps_allocated_size
1098     + s.macro_maps_allocated_size
1099     + s.macro_maps_locations_size;
1100 
1101   total_used_map_size = s.ordinary_maps_used_size
1102     + s.macro_maps_used_size
1103     + s.macro_maps_locations_size;
1104 
1105   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1106            s.num_expanded_macros);
1107   if (s.num_expanded_macros != 0)
1108     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1109              s.num_macro_tokens / s.num_expanded_macros);
1110   fprintf (stderr,
1111            "\nLine Table allocations during the "
1112 	   "compilation process\n");
1113   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1114 	   SIZE_AMOUNT (s.num_ordinary_maps_used));
1115   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1116 	   SIZE_AMOUNT (s.ordinary_maps_used_size));
1117   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1118 	   SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1119   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1120 	   SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1121   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1122 	   SIZE_AMOUNT (s.num_macro_maps_used));
1123   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1124 	   SIZE_AMOUNT (s.macro_maps_used_size));
1125   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1126 	   SIZE_AMOUNT (s.macro_maps_locations_size));
1127   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1128 	   SIZE_AMOUNT (macro_maps_size));
1129   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1130 	   SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1131   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1132 	   SIZE_AMOUNT (total_allocated_map_size));
1133   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1134 	   SIZE_AMOUNT (total_used_map_size));
1135   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1136 	   SIZE_AMOUNT (s.adhoc_table_size));
1137   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1138 	   SIZE_AMOUNT (s.adhoc_table_entries_used));
1139   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1140 	   SIZE_AMOUNT (line_table->num_optimized_ranges));
1141   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1142 	   SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1143 
1144   fprintf (stderr, "\n");
1145 }
1146 
1147 /* Get location one beyond the final location in ordinary map IDX.  */
1148 
1149 static location_t
get_end_location(class line_maps * set,unsigned int idx)1150 get_end_location (class line_maps *set, unsigned int idx)
1151 {
1152   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1153     return set->highest_location;
1154 
1155   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1156   return MAP_START_LOCATION (next_map);
1157 }
1158 
/* Helper function for write_digit_row.  Write the least significant
   decimal digit of DIGIT to STREAM as a single character.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1166 
1167 /* Helper function for dump_location_info.
1168    Write a row of numbers to STREAM, numbering a source line,
1169    giving the units, tens, hundreds etc of the column number.  */
1170 
1171 static void
write_digit_row(FILE * stream,int indent,const line_map_ordinary * map,location_t loc,int max_col,int divisor)1172 write_digit_row (FILE *stream, int indent,
1173 		 const line_map_ordinary *map,
1174 		 location_t loc, int max_col, int divisor)
1175 {
1176   fprintf (stream, "%*c", indent, ' ');
1177   fprintf (stream, "|");
1178   for (int column = 1; column < max_col; column++)
1179     {
1180       location_t column_loc = loc + (column << map->m_range_bits);
1181       write_digit (stream, column_loc / divisor);
1182     }
1183   fprintf (stream, "\n");
1184 }
1185 
1186 /* Write a half-closed (START) / half-open (END) interval of
1187    location_t to STREAM.  */
1188 
1189 static void
dump_location_range(FILE * stream,location_t start,location_t end)1190 dump_location_range (FILE *stream,
1191 		     location_t start, location_t end)
1192 {
1193   fprintf (stream,
1194 	   "  location_t interval: %u <= loc < %u\n",
1195 	   start, end);
1196 }
1197 
1198 /* Write a labelled description of a half-closed (START) / half-open (END)
1199    interval of location_t to STREAM.  */
1200 
1201 static void
dump_labelled_location_range(FILE * stream,const char * name,location_t start,location_t end)1202 dump_labelled_location_range (FILE *stream,
1203 			      const char *name,
1204 			      location_t start, location_t end)
1205 {
1206   fprintf (stream, "%s\n", name);
1207   dump_location_range (stream, start, end);
1208   fprintf (stream, "\n");
1209 }
1210 
1211 /* Write a visualization of the locations in the line_table to STREAM.  */
1212 
1213 void
dump_location_info(FILE * stream)1214 dump_location_info (FILE *stream)
1215 {
1216   /* Visualize the reserved locations.  */
1217   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1218 				0, RESERVED_LOCATION_COUNT);
1219 
1220   /* Visualize the ordinary line_map instances, rendering the sources. */
1221   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1222     {
1223       location_t end_location = get_end_location (line_table, idx);
1224       /* half-closed: doesn't include this one. */
1225 
1226       const line_map_ordinary *map
1227 	= LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1228       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1229       dump_location_range (stream,
1230 			   MAP_START_LOCATION (map), end_location);
1231       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1232       fprintf (stream, "  starting at line: %i\n",
1233 	       ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1234       fprintf (stream, "  column and range bits: %i\n",
1235 	       map->m_column_and_range_bits);
1236       fprintf (stream, "  column bits: %i\n",
1237 	       map->m_column_and_range_bits - map->m_range_bits);
1238       fprintf (stream, "  range bits: %i\n",
1239 	       map->m_range_bits);
1240       const char * reason;
1241       switch (map->reason) {
1242       case LC_ENTER:
1243 	reason = "LC_ENTER";
1244 	break;
1245       case LC_LEAVE:
1246 	reason = "LC_LEAVE";
1247 	break;
1248       case LC_RENAME:
1249 	reason = "LC_RENAME";
1250 	break;
1251       case LC_RENAME_VERBATIM:
1252 	reason = "LC_RENAME_VERBATIM";
1253 	break;
1254       case LC_ENTER_MACRO:
1255 	reason = "LC_RENAME_MACRO";
1256 	break;
1257       default:
1258 	reason = "Unknown";
1259       }
1260       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1261 
1262       const line_map_ordinary *includer_map
1263 	= linemap_included_from_linemap (line_table, map);
1264       fprintf (stream, "  included from location: %d",
1265 	       linemap_included_from (map));
1266       if (includer_map) {
1267 	fprintf (stream, " (in ordinary map %d)",
1268 		 int (includer_map - line_table->info_ordinary.maps));
1269       }
1270       fprintf (stream, "\n");
1271 
1272       /* Render the span of source lines that this "map" covers.  */
1273       for (location_t loc = MAP_START_LOCATION (map);
1274 	   loc < end_location;
1275 	   loc += (1 << map->m_range_bits) )
1276 	{
1277 	  gcc_assert (pure_location_p (line_table, loc) );
1278 
1279 	  expanded_location exploc
1280 	    = linemap_expand_location (line_table, map, loc);
1281 
1282 	  if (exploc.column == 0)
1283 	    {
1284 	      /* Beginning of a new source line: draw the line.  */
1285 
1286 	      char_span line_text = location_get_source_line (exploc.file,
1287 							      exploc.line);
1288 	      if (!line_text)
1289 		break;
1290 	      fprintf (stream,
1291 		       "%s:%3i|loc:%5i|%.*s\n",
1292 		       exploc.file, exploc.line,
1293 		       loc,
1294 		       (int)line_text.length (), line_text.get_buffer ());
1295 
1296 	      /* "loc" is at column 0, which means "the whole line".
1297 		 Render the locations *within* the line, by underlining
1298 		 it, showing the location_t numeric values
1299 		 at each column.  */
1300 	      size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1301 	      if (max_col > line_text.length ())
1302 		max_col = line_text.length () + 1;
1303 
1304 	      int len_lnum = num_digits (exploc.line);
1305 	      if (len_lnum < 3)
1306 		len_lnum = 3;
1307 	      int len_loc = num_digits (loc);
1308 	      if (len_loc < 5)
1309 		len_loc = 5;
1310 
1311 	      int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1312 
1313 	      /* Thousands.  */
1314 	      if (end_location > 999)
1315 		write_digit_row (stream, indent, map, loc, max_col, 1000);
1316 
1317 	      /* Hundreds.  */
1318 	      if (end_location > 99)
1319 		write_digit_row (stream, indent, map, loc, max_col, 100);
1320 
1321 	      /* Tens.  */
1322 	      write_digit_row (stream, indent, map, loc, max_col, 10);
1323 
1324 	      /* Units.  */
1325 	      write_digit_row (stream, indent, map, loc, max_col, 1);
1326 	    }
1327 	}
1328       fprintf (stream, "\n");
1329     }
1330 
1331   /* Visualize unallocated values.  */
1332   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1333 				line_table->highest_location,
1334 				LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1335 
1336   /* Visualize the macro line_map instances, rendering the sources. */
1337   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1338     {
1339       /* Each macro map that is allocated owns location_t values
1340 	 that are *lower* that the one before them.
1341 	 Hence it's meaningful to view them either in order of ascending
1342 	 source locations, or in order of ascending macro map index.  */
1343       const bool ascending_location_ts = true;
1344       unsigned int idx = (ascending_location_ts
1345 			  ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1346 			  : i);
1347       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1348       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1349 	       idx,
1350 	       linemap_map_get_macro_name (map),
1351 	       MACRO_MAP_NUM_MACRO_TOKENS (map));
1352       dump_location_range (stream,
1353 			   map->start_location,
1354 			   (map->start_location
1355 			    + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1356       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1357 	      "expansion point is location %i",
1358 	      MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1359       fprintf (stream, "  map->start_location: %u\n",
1360 	       map->start_location);
1361 
1362       fprintf (stream, "  macro_locations:\n");
1363       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1364 	{
1365 	  location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1366 	  location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1367 
1368 	  /* linemap_add_macro_token encodes token numbers in an expansion
1369 	     by putting them after MAP_START_LOCATION. */
1370 
1371 	  /* I'm typically seeing 4 uninitialized entries at the end of
1372 	     0xafafafaf.
1373 	     This appears to be due to macro.cc:replace_args
1374 	     adding 2 extra args for padding tokens; presumably there may
1375 	     be a leading and/or trailing padding token injected,
1376 	     each for 2 more location slots.
1377 	     This would explain there being up to 4 location_ts slots
1378 	     that may be uninitialized.  */
1379 
1380 	  fprintf (stream, "    %u: %u, %u\n",
1381 		   i,
1382 		   x,
1383 		   y);
1384 	  if (x == y)
1385 	    {
1386 	      if (x < MAP_START_LOCATION (map))
1387 		inform (x, "token %u has %<x-location == y-location == %u%>",
1388 			i, x);
1389 	      else
1390 		fprintf (stream,
1391 			 "x-location == y-location == %u encodes token # %u\n",
1392 			 x, x - MAP_START_LOCATION (map));
1393 		}
1394 	  else
1395 	    {
1396 	      inform (x, "token %u has %<x-location == %u%>", i, x);
1397 	      inform (x, "token %u has %<y-location == %u%>", i, y);
1398 	    }
1399 	}
1400       fprintf (stream, "\n");
1401     }
1402 
1403   /* It appears that MAX_LOCATION_T itself is never assigned to a
1404      macro map, presumably due to an off-by-one error somewhere
1405      between the logic in linemap_enter_macro and
1406      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1407   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1408 				MAX_LOCATION_T,
1409 				MAX_LOCATION_T + 1);
1410 
1411   /* Visualize ad-hoc values.  */
1412   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1413 				MAX_LOCATION_T + 1, UINT_MAX);
1414 }
1415 
1416 /* string_concat's constructor.  */
1417 
string_concat(int num,location_t * locs)1418 string_concat::string_concat (int num, location_t *locs)
1419   : m_num (num)
1420 {
1421   m_locs = ggc_vec_alloc <location_t> (num);
1422   for (int i = 0; i < num; i++)
1423     m_locs[i] = locs[i];
1424 }
1425 
1426 /* string_concat_db's constructor.  */
1427 
string_concat_db()1428 string_concat_db::string_concat_db ()
1429 {
1430   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1431 }
1432 
1433 /* Record that a string concatenation occurred, covering NUM
1434    string literal tokens.  LOCS is an array of size NUM, containing the
1435    locations of the tokens.  A copy of LOCS is taken.  */
1436 
1437 void
record_string_concatenation(int num,location_t * locs)1438 string_concat_db::record_string_concatenation (int num, location_t *locs)
1439 {
1440   gcc_assert (num > 1);
1441   gcc_assert (locs);
1442 
1443   location_t key_loc = get_key_loc (locs[0]);
1444   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1445      any data now recorded under key 'key_loc' would be overwritten by a
1446      subsequent call with the same key 'key_loc'.  */
1447   if (RESERVED_LOCATION_P (key_loc))
1448     return;
1449 
1450   string_concat *concat
1451     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1452   m_table->put (key_loc, concat);
1453 }
1454 
1455 /* Determine if LOC was the location of the initial token of a
1456    concatenation of string literal tokens.
1457    If so, *OUT_NUM is written to with the number of tokens, and
1458    *OUT_LOCS with the location of an array of locations of the
1459    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1460    storage owned by the string_concat_db.
1461    Otherwise, return false.  */
1462 
1463 bool
get_string_concatenation(location_t loc,int * out_num,location_t ** out_locs)1464 string_concat_db::get_string_concatenation (location_t loc,
1465 					    int *out_num,
1466 					    location_t **out_locs)
1467 {
1468   gcc_assert (out_num);
1469   gcc_assert (out_locs);
1470 
1471   location_t key_loc = get_key_loc (loc);
1472   /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1473      discussion in 'string_concat_db::record_string_concatenation'.  */
1474   if (RESERVED_LOCATION_P (key_loc))
1475     return false;
1476 
1477   string_concat **concat = m_table->get (key_loc);
1478   if (!concat)
1479     return false;
1480 
1481   *out_num = (*concat)->m_num;
1482   *out_locs =(*concat)->m_locs;
1483   return true;
1484 }
1485 
1486 /* Internal function.  Canonicalize LOC into a form suitable for
1487    use as a key within the database, stripping away macro expansion,
1488    ad-hoc information, and range information, using the location of
1489    the start of LOC within an ordinary linemap.  */
1490 
1491 location_t
get_key_loc(location_t loc)1492 string_concat_db::get_key_loc (location_t loc)
1493 {
1494   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1495 				  NULL);
1496 
1497   loc = get_range_from_loc (line_table, loc).m_start;
1498 
1499   return loc;
1500 }
1501 
1502 /* Helper class for use within get_substring_ranges_for_loc.
1503    An vec of cpp_string with responsibility for releasing all of the
1504    str->text for each str in the vector.  */
1505 
1506 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1507 {
1508  public:
auto_cpp_string_vec(int alloc)1509   auto_cpp_string_vec (int alloc)
1510     : auto_vec <cpp_string> (alloc) {}
1511 
~auto_cpp_string_vec()1512   ~auto_cpp_string_vec ()
1513   {
1514     /* Clean up the copies within this vec.  */
1515     int i;
1516     cpp_string *str;
1517     FOR_EACH_VEC_ELT (*this, i, str)
1518       free (const_cast <unsigned char *> (str->text));
1519   }
1520 };
1521 
1522 /* Attempt to populate RANGES with source location information on the
1523    individual characters within the string literal found at STRLOC.
1524    If CONCATS is non-NULL, then any string literals that the token at
1525    STRLOC  was concatenated with are also added to RANGES.
1526 
1527    Return NULL if successful, or an error message if any errors occurred (in
1528    which case RANGES may be only partially populated and should not
1529    be used).
1530 
1531    This is implemented by re-parsing the relevant source line(s).  */
1532 
1533 static const char *
get_substring_ranges_for_loc(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,cpp_substring_ranges & ranges)1534 get_substring_ranges_for_loc (cpp_reader *pfile,
1535 			      string_concat_db *concats,
1536 			      location_t strloc,
1537 			      enum cpp_ttype type,
1538 			      cpp_substring_ranges &ranges)
1539 {
1540   gcc_assert (pfile);
1541 
1542   if (strloc == UNKNOWN_LOCATION)
1543     return "unknown location";
1544 
1545   /* Reparsing the strings requires accurate location information.
1546      If -ftrack-macro-expansion has been overridden from its default
1547      of 2, then we might have a location of a macro expansion point,
1548      rather than the location of the literal itself.
1549      Avoid this by requiring that we have full macro expansion tracking
1550      for substring locations to be available.  */
1551   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1552     return "track_macro_expansion != 2";
1553 
1554   /* If #line or # 44 "file"-style directives are present, then there's
1555      no guarantee that the line numbers we have can be used to locate
1556      the strings.  For example, we might have a .i file with # directives
1557      pointing back to lines within a .c file, but the .c file might
1558      have been edited since the .i file was created.
1559      In such a case, the safest course is to disable on-demand substring
1560      locations.  */
1561   if (line_table->seen_line_directive)
1562     return "seen line directive";
1563 
1564   /* If string concatenation has occurred at STRLOC, get the locations
1565      of all of the literal tokens making up the compound string.
1566      Otherwise, just use STRLOC.  */
1567   int num_locs = 1;
1568   location_t *strlocs = &strloc;
1569   if (concats)
1570     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1571 
1572   auto_cpp_string_vec strs (num_locs);
1573   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1574   for (int i = 0; i < num_locs; i++)
1575     {
1576       /* Get range of strloc.  We will use it to locate the start and finish
1577 	 of the literal token within the line.  */
1578       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1579 
1580       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1581 	{
1582 	  /* If the string token was within a macro expansion, then we can
1583 	     cope with it for the simple case where we have a single token.
1584 	     Otherwise, bail out.  */
1585 	  if (src_range.m_start != src_range.m_finish)
1586 	    return "macro expansion";
1587 	}
1588       else
1589 	{
1590 	  if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1591 	    /* If so, we can't reliably determine where the token started within
1592 	       its line.  */
1593 	    return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1594 
1595 	  if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1596 	    /* If so, we can't reliably determine where the token finished
1597 	       within its line.  */
1598 	    return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1599 	}
1600 
1601       expanded_location start
1602 	= expand_location_to_spelling_point (src_range.m_start,
1603 					     LOCATION_ASPECT_START);
1604       expanded_location finish
1605 	= expand_location_to_spelling_point (src_range.m_finish,
1606 					     LOCATION_ASPECT_FINISH);
1607       if (start.file != finish.file)
1608 	return "range endpoints are in different files";
1609       if (start.line != finish.line)
1610 	return "range endpoints are on different lines";
1611       if (start.column > finish.column)
1612 	return "range endpoints are reversed";
1613 
1614       char_span line = location_get_source_line (start.file, start.line);
1615       if (!line)
1616 	return "unable to read source line";
1617 
1618       /* Determine the location of the literal (including quotes
1619 	 and leading prefix chars, such as the 'u' in a u""
1620 	 token).  */
1621       size_t literal_length = finish.column - start.column + 1;
1622 
1623       /* Ensure that we don't crash if we got the wrong location.  */
1624       if (start.column < 1)
1625 	return "zero start column";
1626       if (line.length () < (start.column - 1 + literal_length))
1627 	return "line is not wide enough";
1628 
1629       char_span literal = line.subspan (start.column - 1, literal_length);
1630 
1631       cpp_string from;
1632       from.len = literal_length;
1633       /* Make a copy of the literal, to avoid having to rely on
1634 	 the lifetime of the copy of the line within the cache.
1635 	 This will be released by the auto_cpp_string_vec dtor.  */
1636       from.text = (unsigned char *)literal.xstrdup ();
1637       strs.safe_push (from);
1638 
1639       /* For very long lines, a new linemap could have started
1640 	 halfway through the token.
1641 	 Ensure that the loc_reader uses the linemap of the
1642 	 *end* of the token for its start location.  */
1643       const line_map_ordinary *start_ord_map;
1644       linemap_resolve_location (line_table, src_range.m_start,
1645 				LRK_SPELLING_LOCATION, &start_ord_map);
1646       const line_map_ordinary *final_ord_map;
1647       linemap_resolve_location (line_table, src_range.m_finish,
1648 				LRK_SPELLING_LOCATION, &final_ord_map);
1649       if (start_ord_map == NULL || final_ord_map == NULL)
1650 	return "failed to get ordinary maps";
1651       /* Bulletproofing.  We ought to only have different ordinary maps
1652 	 for start vs finish due to line-length jumps.  */
1653       if (start_ord_map != final_ord_map
1654 	  && start_ord_map->to_file != final_ord_map->to_file)
1655 	return "start and finish are spelled in different ordinary maps";
1656       /* The file from linemap_resolve_location ought to match that from
1657 	 expand_location_to_spelling_point.  */
1658       if (start_ord_map->to_file != start.file)
1659 	return "mismatching file after resolving linemap";
1660 
1661       location_t start_loc
1662 	= linemap_position_for_line_and_column (line_table, final_ord_map,
1663 						start.line, start.column);
1664 
1665       cpp_string_location_reader loc_reader (start_loc, line_table);
1666       loc_readers.safe_push (loc_reader);
1667     }
1668 
1669   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1670   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1671 						 loc_readers.address (),
1672 						 num_locs, &ranges, type);
1673   if (err)
1674     return err;
1675 
1676   /* Success: "ranges" should now contain information on the string.  */
1677   return NULL;
1678 }
1679 
1680 /* Attempt to populate *OUT_LOC with source location information on the
1681    given characters within the string literal found at STRLOC.
1682    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1683    character set.
1684 
1685    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1686    and string literal "012345\n789"
1687    *OUT_LOC is written to with:
1688      "012345\n789"
1689          ~^~~~~
1690 
1691    If CONCATS is non-NULL, then any string literals that the token at
1692    STRLOC was concatenated with are also considered.
1693 
1694    This is implemented by re-parsing the relevant source line(s).
1695 
1696    Return NULL if successful, or an error message if any errors occurred.
1697    Error messages are intended for GCC developers (to help debugging) rather
1698    than for end-users.  */
1699 
1700 const char *
get_location_within_string(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int caret_idx,int start_idx,int end_idx,location_t * out_loc)1701 get_location_within_string (cpp_reader *pfile,
1702 			    string_concat_db *concats,
1703 			    location_t strloc,
1704 			    enum cpp_ttype type,
1705 			    int caret_idx, int start_idx, int end_idx,
1706 			    location_t *out_loc)
1707 {
1708   gcc_checking_assert (caret_idx >= 0);
1709   gcc_checking_assert (start_idx >= 0);
1710   gcc_checking_assert (end_idx >= 0);
1711   gcc_assert (out_loc);
1712 
1713   cpp_substring_ranges ranges;
1714   const char *err
1715     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1716   if (err)
1717     return err;
1718 
1719   if (caret_idx >= ranges.get_num_ranges ())
1720     return "caret_idx out of range";
1721   if (start_idx >= ranges.get_num_ranges ())
1722     return "start_idx out of range";
1723   if (end_idx >= ranges.get_num_ranges ())
1724     return "end_idx out of range";
1725 
1726   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1727 			    ranges.get_range (start_idx).m_start,
1728 			    ranges.get_range (end_idx).m_finish);
1729   return NULL;
1730 }
1731 
1732 #if CHECKING_P
1733 
1734 namespace selftest {
1735 
1736 /* Selftests of location handling.  */
1737 
1738 /* Attempt to populate *OUT_RANGE with source location information on the
1739    given character within the string literal found at STRLOC.
1740    CHAR_IDX refers to an offset within the execution character set.
1741    If CONCATS is non-NULL, then any string literals that the token at
1742    STRLOC was concatenated with are also considered.
1743 
1744    This is implemented by re-parsing the relevant source line(s).
1745 
1746    Return NULL if successful, or an error message if any errors occurred.
1747    Error messages are intended for GCC developers (to help debugging) rather
1748    than for end-users.  */
1749 
1750 static const char *
get_source_range_for_char(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int char_idx,source_range * out_range)1751 get_source_range_for_char (cpp_reader *pfile,
1752 			   string_concat_db *concats,
1753 			   location_t strloc,
1754 			   enum cpp_ttype type,
1755 			   int char_idx,
1756 			   source_range *out_range)
1757 {
1758   gcc_checking_assert (char_idx >= 0);
1759   gcc_assert (out_range);
1760 
1761   cpp_substring_ranges ranges;
1762   const char *err
1763     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1764   if (err)
1765     return err;
1766 
1767   if (char_idx >= ranges.get_num_ranges ())
1768     return "char_idx out of range";
1769 
1770   *out_range = ranges.get_range (char_idx);
1771   return NULL;
1772 }
1773 
1774 /* As get_source_range_for_char, but write to *OUT the number
1775    of ranges that are available.  */
1776 
1777 static const char *
get_num_source_ranges_for_substring(cpp_reader * pfile,string_concat_db * concats,location_t strloc,enum cpp_ttype type,int * out)1778 get_num_source_ranges_for_substring (cpp_reader *pfile,
1779 				     string_concat_db *concats,
1780 				     location_t strloc,
1781 				     enum cpp_ttype type,
1782 				     int *out)
1783 {
1784   gcc_assert (out);
1785 
1786   cpp_substring_ranges ranges;
1787   const char *err
1788     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1789 
1790   if (err)
1791     return err;
1792 
1793   *out = ranges.get_num_ranges ();
1794   return NULL;
1795 }
1796 
1797 /* Selftests of location handling.  */
1798 
1799 /* Verify that compare() on linenum_type handles comparisons over the full
1800    range of the type.  */
1801 
1802 static void
test_linenum_comparisons()1803 test_linenum_comparisons ()
1804 {
1805   linenum_type min_line (0);
1806   linenum_type max_line (0xffffffff);
1807   ASSERT_EQ (0, compare (min_line, min_line));
1808   ASSERT_EQ (0, compare (max_line, max_line));
1809 
1810   ASSERT_GT (compare (max_line, min_line), 0);
1811   ASSERT_LT (compare (min_line, max_line), 0);
1812 }
1813 
1814 /* Helper function for verifying location data: when location_t
1815    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1816    as having column 0.  */
1817 
1818 static bool
should_have_column_data_p(location_t loc)1819 should_have_column_data_p (location_t loc)
1820 {
1821   if (IS_ADHOC_LOC (loc))
1822     loc = get_location_from_adhoc_loc (line_table, loc);
1823   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1824     return false;
1825   return true;
1826 }
1827 
1828 /* Selftest for should_have_column_data_p.  */
1829 
1830 static void
test_should_have_column_data_p()1831 test_should_have_column_data_p ()
1832 {
1833   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1834   ASSERT_TRUE
1835     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1836   ASSERT_FALSE
1837     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1838 }
1839 
1840 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1841    on LOC.  */
1842 
1843 static void
assert_loceq(const char * exp_filename,int exp_linenum,int exp_colnum,location_t loc)1844 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1845 	      location_t loc)
1846 {
1847   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1848   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1849   /* If location_t values are sufficiently high, then column numbers
1850      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1851      When close to the threshold, column numbers *may* be present: if
1852      the final linemap before the threshold contains a line that straddles
1853      the threshold, locations in that line have column information.  */
1854   if (should_have_column_data_p (loc))
1855     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1856 }
1857 
/* A number of selftests need to build a line table containing one or
   more line maps.

   To maximize test coverage, such tests should be run under a variety
   of conditions:
   - line_table->default_range_bits: some frontends use a non-zero
   value, and others use zero
   - the fallback modes within line-map.cc: there are various threshold
   values for location_t beyond which line-map.cc changes behavior
   (disabling of the range-packing optimization, disabling of
   column-tracking).  These can be exercised by starting the line_table
   at interesting values at or near those thresholds.

   This class describes one particular case within that test matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* Starting value for the line_table's locations.  */
  int m_base_location;
};
1885 
1886 /* Constructor.  Store the old value of line_table, and create a new
1887    one, using sane defaults.  */
1888 
line_table_test()1889 line_table_test::line_table_test ()
1890 {
1891   gcc_assert (saved_line_table == NULL);
1892   saved_line_table = line_table;
1893   line_table = ggc_alloc<line_maps> ();
1894   linemap_init (line_table, BUILTINS_LOCATION);
1895   gcc_assert (saved_line_table->reallocator);
1896   line_table->reallocator = saved_line_table->reallocator;
1897   gcc_assert (saved_line_table->round_alloc_size);
1898   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1899   line_table->default_range_bits = 0;
1900 }
1901 
1902 /* Constructor.  Store the old value of line_table, and create a new
1903    one, using the sitation described in CASE_.  */
1904 
line_table_test(const line_table_case & case_)1905 line_table_test::line_table_test (const line_table_case &case_)
1906 {
1907   gcc_assert (saved_line_table == NULL);
1908   saved_line_table = line_table;
1909   line_table = ggc_alloc<line_maps> ();
1910   linemap_init (line_table, BUILTINS_LOCATION);
1911   gcc_assert (saved_line_table->reallocator);
1912   line_table->reallocator = saved_line_table->reallocator;
1913   gcc_assert (saved_line_table->round_alloc_size);
1914   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1915   line_table->default_range_bits = case_.m_default_range_bits;
1916   if (case_.m_base_location)
1917     {
1918       line_table->highest_location = case_.m_base_location;
1919       line_table->highest_line = case_.m_base_location;
1920     }
1921 }
1922 
1923 /* Destructor.  Restore the old value of line_table.  */
1924 
~line_table_test()1925 line_table_test::~line_table_test ()
1926 {
1927   gcc_assert (saved_line_table != NULL);
1928   line_table = saved_line_table;
1929   saved_line_table = NULL;
1930 }
1931 
1932 /* Verify basic operation of ordinary linemaps.  */
1933 
1934 static void
test_accessing_ordinary_linemaps(const line_table_case & case_)1935 test_accessing_ordinary_linemaps (const line_table_case &case_)
1936 {
1937   line_table_test ltt (case_);
1938 
1939   /* Build a simple linemap describing some locations. */
1940   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1941 
1942   linemap_line_start (line_table, 1, 100);
1943   location_t loc_a = linemap_position_for_column (line_table, 1);
1944   location_t loc_b = linemap_position_for_column (line_table, 23);
1945 
1946   linemap_line_start (line_table, 2, 100);
1947   location_t loc_c = linemap_position_for_column (line_table, 1);
1948   location_t loc_d = linemap_position_for_column (line_table, 17);
1949 
1950   /* Example of a very long line.  */
1951   linemap_line_start (line_table, 3, 2000);
1952   location_t loc_e = linemap_position_for_column (line_table, 700);
1953 
1954   /* Transitioning back to a short line.  */
1955   linemap_line_start (line_table, 4, 0);
1956   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1957 
1958   if (should_have_column_data_p (loc_back_to_short))
1959     {
1960       /* Verify that we switched to short lines in the linemap.  */
1961       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1962       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1963     }
1964 
1965   /* Example of a line that will eventually be seen to be longer
1966      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
1967      below that.  */
1968   linemap_line_start (line_table, 5, 2000);
1969 
1970   location_t loc_start_of_very_long_line
1971     = linemap_position_for_column (line_table, 2000);
1972   location_t loc_too_wide
1973     = linemap_position_for_column (line_table, 4097);
1974   location_t loc_too_wide_2
1975     = linemap_position_for_column (line_table, 4098);
1976 
1977   /* ...and back to a sane line length.  */
1978   linemap_line_start (line_table, 6, 100);
1979   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
1980 
1981   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1982 
1983   /* Multiple files.  */
1984   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1985   linemap_line_start (line_table, 1, 200);
1986   location_t loc_f = linemap_position_for_column (line_table, 150);
1987   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1988 
1989   /* Verify that we can recover the location info.  */
1990   assert_loceq ("foo.c", 1, 1, loc_a);
1991   assert_loceq ("foo.c", 1, 23, loc_b);
1992   assert_loceq ("foo.c", 2, 1, loc_c);
1993   assert_loceq ("foo.c", 2, 17, loc_d);
1994   assert_loceq ("foo.c", 3, 700, loc_e);
1995   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1996 
1997   /* In the very wide line, the initial location should be fully tracked.  */
1998   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
1999   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2000      be disabled.  */
2001   assert_loceq ("foo.c", 5, 0, loc_too_wide);
2002   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
2003   /*...and column-tracking should be re-enabled for subsequent lines.  */
2004   assert_loceq ("foo.c", 6, 10, loc_sane_again);
2005 
2006   assert_loceq ("bar.c", 1, 150, loc_f);
2007 
2008   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2009   ASSERT_TRUE (pure_location_p (line_table, loc_a));
2010 
2011   /* Verify using make_location to build a range, and extracting data
2012      back from it.  */
2013   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2014   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2015   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2016   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2017   ASSERT_EQ (loc_b, src_range.m_start);
2018   ASSERT_EQ (loc_d, src_range.m_finish);
2019 }
2020 
2021 /* Verify various properties of UNKNOWN_LOCATION.  */
2022 
2023 static void
test_unknown_location()2024 test_unknown_location ()
2025 {
2026   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2027   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2028   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2029 }
2030 
2031 /* Verify various properties of BUILTINS_LOCATION.  */
2032 
2033 static void
test_builtins()2034 test_builtins ()
2035 {
2036   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
2037   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2038 }
2039 
2040 /* Regression test for make_location.
2041    Ensure that we use pure locations for the start/finish of the range,
2042    rather than storing a packed or ad-hoc range as the start/finish.  */
2043 
2044 static void
test_make_location_nonpure_range_endpoints(const line_table_case & case_)2045 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2046 {
2047   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2048      with C++ frontend.
2049      ....................0000000001111111111222.
2050      ....................1234567890123456789012.  */
2051   const char *content = "     r += !aaa == bbb;\n";
2052   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2053   line_table_test ltt (case_);
2054   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2055 
2056   const location_t c11 = linemap_position_for_column (line_table, 11);
2057   const location_t c12 = linemap_position_for_column (line_table, 12);
2058   const location_t c13 = linemap_position_for_column (line_table, 13);
2059   const location_t c14 = linemap_position_for_column (line_table, 14);
2060   const location_t c21 = linemap_position_for_column (line_table, 21);
2061 
2062   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2063     return;
2064 
2065   /* Use column 13 for the caret location, arbitrarily, to verify that we
2066      handle start != caret.  */
2067   const location_t aaa = make_location (c13, c12, c14);
2068   ASSERT_EQ (c13, get_pure_location (aaa));
2069   ASSERT_EQ (c12, get_start (aaa));
2070   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2071   ASSERT_EQ (c14, get_finish (aaa));
2072   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2073 
2074   /* Make a location using a location with a range as the start-point.  */
2075   const location_t not_aaa = make_location (c11, aaa, c14);
2076   ASSERT_EQ (c11, get_pure_location (not_aaa));
2077   /* It should use the start location of the range, not store the range
2078      itself.  */
2079   ASSERT_EQ (c12, get_start (not_aaa));
2080   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2081   ASSERT_EQ (c14, get_finish (not_aaa));
2082   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2083 
2084   /* Similarly, make a location with a range as the end-point.  */
2085   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2086   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2087   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2088   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2089   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2090   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2091   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2092   /* It should use the finish location of the range, not store the range
2093      itself.  */
2094   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2095   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2096   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2097   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2098   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2099 }
2100 
2101 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
2102 
2103 static void
test_reading_source_line()2104 test_reading_source_line ()
2105 {
2106   /* Create a tempfile and write some text to it.  */
2107   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2108 			"01234567890123456789\n"
2109 			"This is the test text\n"
2110 			"This is the 3rd line");
2111 
2112   /* Read back a specific line from the tempfile.  */
2113   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2114   ASSERT_TRUE (source_line);
2115   ASSERT_TRUE (source_line.get_buffer () != NULL);
2116   ASSERT_EQ (20, source_line.length ());
2117   ASSERT_TRUE (!strncmp ("This is the 3rd line",
2118 			 source_line.get_buffer (), source_line.length ()));
2119 
2120   source_line = location_get_source_line (tmp.get_filename (), 2);
2121   ASSERT_TRUE (source_line);
2122   ASSERT_TRUE (source_line.get_buffer () != NULL);
2123   ASSERT_EQ (21, source_line.length ());
2124   ASSERT_TRUE (!strncmp ("This is the test text",
2125 			 source_line.get_buffer (), source_line.length ()));
2126 
2127   source_line = location_get_source_line (tmp.get_filename (), 4);
2128   ASSERT_FALSE (source_line);
2129   ASSERT_TRUE (source_line.get_buffer () == NULL);
2130 }
2131 
2132 /* Tests of lexing.  */
2133 
/* Check that cpp_token_as_text, applied to token TOK of PARSER, gives
   a string equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)		     \
  SELFTEST_BEGIN_STMT							     \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));	     \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);		     \
  SELFTEST_END_STMT
2142 
2143 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2144    and ranges from EXP_START_COL to EXP_FINISH_COL.
2145    Use LOC as the effective location of the selftest.  */
2146 
2147 static void
assert_token_loc_eq(const location & loc,const cpp_token * tok,const char * exp_filename,int exp_linenum,int exp_start_col,int exp_finish_col)2148 assert_token_loc_eq (const location &loc,
2149 		     const cpp_token *tok,
2150 		     const char *exp_filename, int exp_linenum,
2151 		     int exp_start_col, int exp_finish_col)
2152 {
2153   location_t tok_loc = tok->src_loc;
2154   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2155   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2156 
2157   /* If location_t values are sufficiently high, then column numbers
2158      will be unavailable.  */
2159   if (!should_have_column_data_p (tok_loc))
2160     return;
2161 
2162   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2163   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2164   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2165   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2166 }
2167 
/* Check TOK->src_loc via assert_token_loc_eq, with SELFTEST_LOCATION
   (i.e. the point of use of this macro) as the effective location of
   the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,		\
			    EXP_START_COL, EXP_FINISH_COL)		\
  assert_token_loc_eq (SELFTEST_LOCATION,				\
		       (TOK),						\
		       (EXP_FILENAME),					\
		       (EXP_LINENUM),					\
		       (EXP_START_COL),					\
		       (EXP_FINISH_COL))
2175 
2176 /* Test of lexing a file using libcpp, verifying tokens and their
2177    location information.  */
2178 
2179 static void
test_lexer(const line_table_case & case_)2180 test_lexer (const line_table_case &case_)
2181 {
2182   /* Create a tempfile and write some text to it.  */
2183   const char *content =
2184     /*00000000011111111112222222222333333.3333444444444.455555555556
2185       12345678901234567890123456789012345.6789012345678.901234567890.  */
2186     ("test_name /* c-style comment */\n"
2187      "                                  \"test literal\"\n"
2188      " // test c++-style comment\n"
2189      "   42\n");
2190   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2191 
2192   line_table_test ltt (case_);
2193 
2194   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2195 
2196   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2197   ASSERT_NE (fname, NULL);
2198 
2199   /* Verify that we get the expected tokens back, with the correct
2200      location information.  */
2201 
2202   location_t loc;
2203   const cpp_token *tok;
2204   tok = cpp_get_token_with_location (parser, &loc);
2205   ASSERT_NE (tok, NULL);
2206   ASSERT_EQ (tok->type, CPP_NAME);
2207   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2208   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2209 
2210   tok = cpp_get_token_with_location (parser, &loc);
2211   ASSERT_NE (tok, NULL);
2212   ASSERT_EQ (tok->type, CPP_STRING);
2213   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2214   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2215 
2216   tok = cpp_get_token_with_location (parser, &loc);
2217   ASSERT_NE (tok, NULL);
2218   ASSERT_EQ (tok->type, CPP_NUMBER);
2219   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2220   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2221 
2222   tok = cpp_get_token_with_location (parser, &loc);
2223   ASSERT_NE (tok, NULL);
2224   ASSERT_EQ (tok->type, CPP_EOF);
2225 
2226   cpp_finish (parser, NULL);
2227   cpp_destroy (parser);
2228 }
2229 
2230 /* Forward decls.  */
2231 
2232 class lexer_test;
2233 class lexer_test_options;
2234 
2235 /* A class for specifying options of a lexer_test.
2236    The "apply" vfunc is called during the lexer_test constructor.  */
2237 
2238 class lexer_test_options
2239 {
2240  public:
2241   virtual void apply (lexer_test &) = 0;
2242 };
2243 
2244 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2245    in its dtor.
2246 
2247    This is needed by struct lexer_test to ensure that the cleanup of the
2248    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2249 
2250 class cpp_reader_ptr
2251 {
2252  public:
cpp_reader_ptr(cpp_reader * ptr)2253   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2254 
~cpp_reader_ptr()2255   ~cpp_reader_ptr ()
2256   {
2257     cpp_finish (m_ptr, NULL);
2258     cpp_destroy (m_ptr);
2259   }
2260 
operator cpp_reader*() const2261   operator cpp_reader * () const { return m_ptr; }
2262 
2263  private:
2264   cpp_reader *m_ptr;
2265 };
2266 
2267 /* A struct for writing lexer tests.  */
2268 
2269 class lexer_test
2270 {
2271 public:
2272   lexer_test (const line_table_case &case_, const char *content,
2273 	      lexer_test_options *options);
2274   ~lexer_test ();
2275 
2276   const cpp_token *get_token ();
2277 
2278   /* The ordering of these fields matters.
2279      The line_table_test must be first, since the cpp_reader_ptr
2280      uses it.
2281      The cpp_reader must be cleaned up *after* the temp_source_file
2282      since the filenames in input.cc's input cache are owned by the
2283      cpp_reader; in particular, when ~temp_source_file evicts the
2284      filename the filenames must still be alive.  */
2285   line_table_test m_ltt;
2286   cpp_reader_ptr m_parser;
2287   temp_source_file m_tempfile;
2288   string_concat_db m_concats;
2289   bool m_implicitly_expect_EOF;
2290 };
2291 
2292 /* Use an EBCDIC encoding for the execution charset, specifically
2293    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2294 
2295    This exercises iconv integration within libcpp.
2296    Not every build of iconv supports the given charset,
2297    so we need to flag this error and handle it gracefully.  */
2298 
2299 class ebcdic_execution_charset : public lexer_test_options
2300 {
2301  public:
ebcdic_execution_charset()2302   ebcdic_execution_charset () : m_num_iconv_errors (0)
2303     {
2304       gcc_assert (s_singleton == NULL);
2305       s_singleton = this;
2306     }
~ebcdic_execution_charset()2307   ~ebcdic_execution_charset ()
2308     {
2309       gcc_assert (s_singleton == this);
2310       s_singleton = NULL;
2311     }
2312 
apply(lexer_test & test)2313   void apply (lexer_test &test) FINAL OVERRIDE
2314   {
2315     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2316     cpp_opts->narrow_charset = "IBM1047";
2317 
2318     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2319     callbacks->diagnostic = on_diagnostic;
2320   }
2321 
on_diagnostic(cpp_reader * pfile ATTRIBUTE_UNUSED,enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,enum cpp_warning_reason reason ATTRIBUTE_UNUSED,rich_location * richloc ATTRIBUTE_UNUSED,const char * msgid,va_list * ap ATTRIBUTE_UNUSED)2322   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2323 			     enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2324 			     enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2325 			     rich_location *richloc ATTRIBUTE_UNUSED,
2326 			     const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2327     ATTRIBUTE_FPTR_PRINTF(5,0)
2328   {
2329     gcc_assert (s_singleton);
2330     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2331     const char *msg = "conversion from %s to %s not supported by iconv";
2332 #ifdef ENABLE_NLS
2333     msg = dgettext ("cpplib", msg);
2334 #endif
2335     /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2336        when the local iconv build doesn't support the conversion.  */
2337     if (strcmp (msgid, msg) == 0)
2338       {
2339 	s_singleton->m_num_iconv_errors++;
2340 	return true;
2341       }
2342 
2343     /* Otherwise, we have an unexpected error.  */
2344     abort ();
2345   }
2346 
iconv_errors_occurred_p() const2347   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2348 
2349  private:
2350   static ebcdic_execution_charset *s_singleton;
2351   int m_num_iconv_errors;
2352 };
2353 
2354 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2355 
2356 /* A lexer_test_options subclass that records a list of diagnostic
2357    messages emitted by the lexer.  */
2358 
2359 class lexer_diagnostic_sink : public lexer_test_options
2360 {
2361  public:
lexer_diagnostic_sink()2362   lexer_diagnostic_sink ()
2363   {
2364     gcc_assert (s_singleton == NULL);
2365     s_singleton = this;
2366   }
~lexer_diagnostic_sink()2367   ~lexer_diagnostic_sink ()
2368   {
2369     gcc_assert (s_singleton == this);
2370     s_singleton = NULL;
2371 
2372     int i;
2373     char *str;
2374     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2375       free (str);
2376   }
2377 
apply(lexer_test & test)2378   void apply (lexer_test &test) FINAL OVERRIDE
2379   {
2380     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2381     callbacks->diagnostic = on_diagnostic;
2382   }
2383 
on_diagnostic(cpp_reader * pfile ATTRIBUTE_UNUSED,enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,enum cpp_warning_reason reason ATTRIBUTE_UNUSED,rich_location * richloc ATTRIBUTE_UNUSED,const char * msgid,va_list * ap)2384   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2385 			     enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2386 			     enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2387 			     rich_location *richloc ATTRIBUTE_UNUSED,
2388 			     const char *msgid, va_list *ap)
2389     ATTRIBUTE_FPTR_PRINTF(5,0)
2390   {
2391     char *msg = xvasprintf (msgid, *ap);
2392     s_singleton->m_diagnostics.safe_push (msg);
2393     return true;
2394   }
2395 
2396   auto_vec<char *> m_diagnostics;
2397 
2398  private:
2399   static lexer_diagnostic_sink *s_singleton;
2400 };
2401 
2402 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2403 
2404 /* Constructor.  Override line_table with a new instance based on CASE_,
2405    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2406    start parsing the tempfile.  */
2407 
lexer_test(const line_table_case & case_,const char * content,lexer_test_options * options)2408 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2409 			lexer_test_options *options)
2410 : m_ltt (case_),
2411   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2412   /* Create a tempfile and write the text to it.  */
2413   m_tempfile (SELFTEST_LOCATION, ".c", content),
2414   m_concats (),
2415   m_implicitly_expect_EOF (true)
2416 {
2417   if (options)
2418     options->apply (*this);
2419 
2420   cpp_init_iconv (m_parser);
2421 
2422   /* Parse the file.  */
2423   const char *fname = cpp_read_main_file (m_parser,
2424 					  m_tempfile.get_filename ());
2425   ASSERT_NE (fname, NULL);
2426 }
2427 
2428 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2429 
~lexer_test()2430 lexer_test::~lexer_test ()
2431 {
2432   location_t loc;
2433   const cpp_token *tok;
2434 
2435   if (m_implicitly_expect_EOF)
2436     {
2437       tok = cpp_get_token_with_location (m_parser, &loc);
2438       ASSERT_NE (tok, NULL);
2439       ASSERT_EQ (tok->type, CPP_EOF);
2440     }
2441 }
2442 
2443 /* Get the next token from m_parser.  */
2444 
2445 const cpp_token *
get_token()2446 lexer_test::get_token ()
2447 {
2448   location_t loc;
2449   const cpp_token *tok;
2450 
2451   tok = cpp_get_token_with_location (m_parser, &loc);
2452   ASSERT_NE (tok, NULL);
2453   return tok;
2454 }
2455 
2456 /* Verify that locations within string literals are correctly handled.  */
2457 
2458 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2459    using the string concatenation database for TEST.
2460 
2461    Assert that the character at index IDX is on EXPECTED_LINE,
2462    and that it begins at column EXPECTED_START_COL and ends at
2463    EXPECTED_FINISH_COL (unless the locations are beyond
2464    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2465    columns).  */
2466 
2467 static void
assert_char_at_range(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,int idx,int expected_line,int expected_start_col,int expected_finish_col)2468 assert_char_at_range (const location &loc,
2469 		      lexer_test& test,
2470 		      location_t strloc, enum cpp_ttype type, int idx,
2471 		      int expected_line, int expected_start_col,
2472 		      int expected_finish_col)
2473 {
2474   cpp_reader *pfile = test.m_parser;
2475   string_concat_db *concats = &test.m_concats;
2476 
2477   source_range actual_range = source_range();
2478   const char *err
2479     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2480 				 &actual_range);
2481   if (should_have_column_data_p (strloc))
2482     ASSERT_EQ_AT (loc, NULL, err);
2483   else
2484     {
2485       ASSERT_STREQ_AT (loc,
2486 		       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2487 		       err);
2488       return;
2489     }
2490 
2491   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2492   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2493   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2494   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2495 
2496   if (should_have_column_data_p (actual_range.m_start))
2497     {
2498       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2499       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2500     }
2501   if (should_have_column_data_p (actual_range.m_finish))
2502     {
2503       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2504       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2505     }
2506 }
2507 
/* Convenience wrapper around assert_char_at_range that passes
   SELFTEST_LOCATION (i.e. the point of use of this macro) as the
   effective location of any errors.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,		\
			     EXPECTED_LINE, EXPECTED_START_COL,		\
			     EXPECTED_FINISH_COL)			\
  assert_char_at_range (SELFTEST_LOCATION,				\
			(LEXER_TEST),					\
			(STRLOC),					\
			(TYPE),						\
			(IDX),						\
			(EXPECTED_LINE),				\
			(EXPECTED_START_COL),				\
			(EXPECTED_FINISH_COL))
2516 
2517 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2518    using the string concatenation database for TEST.
2519 
2520    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2521 
2522 static void
assert_num_substring_ranges(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,int expected_num_ranges)2523 assert_num_substring_ranges (const location &loc,
2524 			     lexer_test& test,
2525 			     location_t strloc,
2526 			     enum cpp_ttype type,
2527 			     int expected_num_ranges)
2528 {
2529   cpp_reader *pfile = test.m_parser;
2530   string_concat_db *concats = &test.m_concats;
2531 
2532   int actual_num_ranges = -1;
2533   const char *err
2534     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2535 					   &actual_num_ranges);
2536   if (should_have_column_data_p (strloc))
2537     ASSERT_EQ_AT (loc, NULL, err);
2538   else
2539     {
2540       ASSERT_STREQ_AT (loc,
2541 		       "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2542 		       err);
2543       return;
2544     }
2545   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2546 }
2547 
/* Convenience wrapper around assert_num_substring_ranges which passes
   SELFTEST_LOCATION as the effective location of any errors.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,		\
				    EXPECTED_NUM_RANGES)		\
  assert_num_substring_ranges (SELFTEST_LOCATION,			\
			       (LEXER_TEST), (STRLOC), (TYPE),		\
			       (EXPECTED_NUM_RANGES))
2555 
2556 
2557 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2558    returns an error (using the string concatenation database for TEST).  */
2559 
2560 static void
assert_has_no_substring_ranges(const location & loc,lexer_test & test,location_t strloc,enum cpp_ttype type,const char * expected_err)2561 assert_has_no_substring_ranges (const location &loc,
2562 				lexer_test& test,
2563 				location_t strloc,
2564 				enum cpp_ttype type,
2565 				const char *expected_err)
2566 {
2567   cpp_reader *pfile = test.m_parser;
2568   string_concat_db *concats = &test.m_concats;
2569   cpp_substring_ranges ranges;
2570   const char *actual_err
2571     = get_substring_ranges_for_loc (pfile, concats, strloc,
2572 				    type, ranges);
2573   if (should_have_column_data_p (strloc))
2574     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2575   else
2576     ASSERT_STREQ_AT (loc,
2577 		     "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2578 		     actual_err);
2579 }
2580 
/* Convenience wrapper around assert_has_no_substring_ranges which passes
   SELFTEST_LOCATION as the effective location of any errors.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)	\
  assert_has_no_substring_ranges (SELFTEST_LOCATION,			\
				  (LEXER_TEST), (STRLOC), (TYPE),	\
				  (ERR))
2584 
2585 /* Lex a simple string literal.  Verify the substring location data, before
2586    and after running cpp_interpret_string on it.  */
2587 
2588 static void
test_lexer_string_locations_simple(const line_table_case & case_)2589 test_lexer_string_locations_simple (const line_table_case &case_)
2590 {
2591   /* Digits 0-9 (with 0 at column 10), the simple way.
2592      ....................000000000.11111111112.2222222223333333333
2593      ....................123456789.01234567890.1234567890123456789
2594      We add a trailing comment to ensure that we correctly locate
2595      the end of the string literal token.  */
2596   const char *content = "        \"0123456789\" /* not a string */\n";
2597   lexer_test test (case_, content, NULL);
2598 
2599   /* Verify that we get the expected token back, with the correct
2600      location information.  */
2601   const cpp_token *tok = test.get_token ();
2602   ASSERT_EQ (tok->type, CPP_STRING);
2603   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2604   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2605 
2606   /* At this point in lexing, the quote characters are treated as part of
2607      the string (they are stripped off by cpp_interpret_string).  */
2608 
2609   ASSERT_EQ (tok->val.str.len, 12);
2610 
2611   /* Verify that cpp_interpret_string works.  */
2612   cpp_string dst_string;
2613   const enum cpp_ttype type = CPP_STRING;
2614   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2615 				      &dst_string, type);
2616   ASSERT_TRUE (result);
2617   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2618   free (const_cast <unsigned char *> (dst_string.text));
2619 
2620   /* Verify ranges of individual characters.  This no longer includes the
2621      opening quote, but does include the closing quote.  */
2622   for (int i = 0; i <= 10; i++)
2623     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2624 			  10 + i, 10 + i);
2625 
2626   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2627 }
2628 
2629 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2630    encoding.  */
2631 
2632 static void
test_lexer_string_locations_ebcdic(const line_table_case & case_)2633 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2634 {
2635   /* EBCDIC support requires iconv.  */
2636   if (!HAVE_ICONV)
2637     return;
2638 
2639   /* Digits 0-9 (with 0 at column 10), the simple way.
2640      ....................000000000.11111111112.2222222223333333333
2641      ....................123456789.01234567890.1234567890123456789
2642      We add a trailing comment to ensure that we correctly locate
2643      the end of the string literal token.  */
2644   const char *content = "        \"0123456789\" /* not a string */\n";
2645   ebcdic_execution_charset use_ebcdic;
2646   lexer_test test (case_, content, &use_ebcdic);
2647 
2648   /* Verify that we get the expected token back, with the correct
2649      location information.  */
2650   const cpp_token *tok = test.get_token ();
2651   ASSERT_EQ (tok->type, CPP_STRING);
2652   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2653   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2654 
2655   /* At this point in lexing, the quote characters are treated as part of
2656      the string (they are stripped off by cpp_interpret_string).  */
2657 
2658   ASSERT_EQ (tok->val.str.len, 12);
2659 
2660   /* The remainder of the test requires an iconv implementation that
2661      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2662   if (use_ebcdic.iconv_errors_occurred_p ())
2663     return;
2664 
2665   /* Verify that cpp_interpret_string works.  */
2666   cpp_string dst_string;
2667   const enum cpp_ttype type = CPP_STRING;
2668   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2669 				      &dst_string, type);
2670   ASSERT_TRUE (result);
2671   /* We should now have EBCDIC-encoded text, specifically
2672      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2673      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2674   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2675 		(const char *)dst_string.text);
2676   free (const_cast <unsigned char *> (dst_string.text));
2677 
2678   /* Verify that we don't attempt to record substring location information
2679      for such cases.  */
2680   ASSERT_HAS_NO_SUBSTRING_RANGES
2681     (test, tok->src_loc, type,
2682      "execution character set != source character set");
2683 }
2684 
2685 /* Lex a string literal containing a hex-escaped character.
2686    Verify the substring location data, before and after running
2687    cpp_interpret_string on it.  */
2688 
2689 static void
test_lexer_string_locations_hex(const line_table_case & case_)2690 test_lexer_string_locations_hex (const line_table_case &case_)
2691 {
2692   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2693      and with a space in place of digit 6, to terminate the escaped
2694      hex code.
2695      ....................000000000.111111.11112222.
2696      ....................123456789.012345.67890123.  */
2697   const char *content = "        \"01234\\x35 789\"\n";
2698   lexer_test test (case_, content, NULL);
2699 
2700   /* Verify that we get the expected token back, with the correct
2701      location information.  */
2702   const cpp_token *tok = test.get_token ();
2703   ASSERT_EQ (tok->type, CPP_STRING);
2704   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2705   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2706 
2707   /* At this point in lexing, the quote characters are treated as part of
2708      the string (they are stripped off by cpp_interpret_string).  */
2709   ASSERT_EQ (tok->val.str.len, 15);
2710 
2711   /* Verify that cpp_interpret_string works.  */
2712   cpp_string dst_string;
2713   const enum cpp_ttype type = CPP_STRING;
2714   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2715 				      &dst_string, type);
2716   ASSERT_TRUE (result);
2717   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2718   free (const_cast <unsigned char *> (dst_string.text));
2719 
2720   /* Verify ranges of individual characters.  This no longer includes the
2721      opening quote, but does include the closing quote.  */
2722   for (int i = 0; i <= 4; i++)
2723     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2724   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2725   for (int i = 6; i <= 10; i++)
2726     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2727 
2728   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2729 }
2730 
2731 /* Lex a string literal containing an octal-escaped character.
2732    Verify the substring location data after running cpp_interpret_string
2733    on it.  */
2734 
2735 static void
test_lexer_string_locations_oct(const line_table_case & case_)2736 test_lexer_string_locations_oct (const line_table_case &case_)
2737 {
2738   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2739      and with a space in place of digit 6, to terminate the escaped
2740      octal code.
2741      ....................000000000.111111.11112222.2222223333333333444
2742      ....................123456789.012345.67890123.4567890123456789012  */
2743   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2744   lexer_test test (case_, content, NULL);
2745 
2746   /* Verify that we get the expected token back, with the correct
2747      location information.  */
2748   const cpp_token *tok = test.get_token ();
2749   ASSERT_EQ (tok->type, CPP_STRING);
2750   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2751 
2752   /* Verify that cpp_interpret_string works.  */
2753   cpp_string dst_string;
2754   const enum cpp_ttype type = CPP_STRING;
2755   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2756 				      &dst_string, type);
2757   ASSERT_TRUE (result);
2758   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2759   free (const_cast <unsigned char *> (dst_string.text));
2760 
2761   /* Verify ranges of individual characters.  This no longer includes the
2762      opening quote, but does include the closing quote.  */
2763   for (int i = 0; i < 5; i++)
2764     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2765   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2766   for (int i = 6; i <= 10; i++)
2767     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2768 
2769   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2770 }
2771 
2772 /* Test of string literal containing letter escapes.  */
2773 
2774 static void
test_lexer_string_locations_letter_escape_1(const line_table_case & case_)2775 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2776 {
2777   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2778      .....................000000000.1.11111.1.1.11222.22222223333333
2779      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2780   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2781   lexer_test test (case_, content, NULL);
2782 
2783   /* Verify that we get the expected tokens back.  */
2784   const cpp_token *tok = test.get_token ();
2785   ASSERT_EQ (tok->type, CPP_STRING);
2786   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2787 
2788   /* Verify ranges of individual characters. */
2789   /* "\t".  */
2790   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2791 			0, 1, 10, 11);
2792   /* "foo". */
2793   for (int i = 1; i <= 3; i++)
2794     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2795 			  i, 1, 11 + i, 11 + i);
2796   /* "\\" and "\n".  */
2797   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2798 			4, 1, 15, 16);
2799   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2800 			5, 1, 17, 18);
2801 
2802   /* "bar" and closing quote for nul-terminator.  */
2803   for (int i = 6; i <= 9; i++)
2804     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2805 			  i, 1, 13 + i, 13 + i);
2806 
2807   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2808 }
2809 
2810 /* Another test of a string literal containing a letter escape.
2811    Based on string seen in
2812      printf ("%-%\n");
2813    in gcc.dg/format/c90-printf-1.c.  */
2814 
2815 static void
test_lexer_string_locations_letter_escape_2(const line_table_case & case_)2816 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2817 {
2818   /* .....................000000000.1111.11.1111.22222222223.
2819      .....................123456789.0123.45.6789.01234567890.  */
2820   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2821   lexer_test test (case_, content, NULL);
2822 
2823   /* Verify that we get the expected tokens back.  */
2824   const cpp_token *tok = test.get_token ();
2825   ASSERT_EQ (tok->type, CPP_STRING);
2826   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2827 
2828   /* Verify ranges of individual characters. */
2829   /* "%-%".  */
2830   for (int i = 0; i < 3; i++)
2831     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2832 			  i, 1, 10 + i, 10 + i);
2833   /* "\n".  */
2834   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2835 			3, 1, 13, 14);
2836 
2837   /* Closing quote for nul-terminator.  */
2838   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2839 			4, 1, 15, 15);
2840 
2841   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2842 }
2843 
2844 /* Lex a string literal containing UCN 4 characters.
2845    Verify the substring location data after running cpp_interpret_string
2846    on it.  */
2847 
2848 static void
test_lexer_string_locations_ucn4(const line_table_case & case_)2849 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2850 {
2851   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2852      as UCN 4.
2853      ....................000000000.111111.111122.222222223.33333333344444
2854      ....................123456789.012345.678901.234567890.12345678901234  */
2855   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2856   lexer_test test (case_, content, NULL);
2857 
2858   /* Verify that we get the expected token back, with the correct
2859      location information.  */
2860   const cpp_token *tok = test.get_token ();
2861   ASSERT_EQ (tok->type, CPP_STRING);
2862   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2863 
2864   /* Verify that cpp_interpret_string works.
2865      The string should be encoded in the execution character
2866      set.  Assuming that is UTF-8, we should have the following:
2867      -----------  ----  -----  -------  ----------------
2868      Byte offset  Byte  Octal  Unicode  Source Column(s)
2869      -----------  ----  -----  -------  ----------------
2870      0            0x30         '0'      10
2871      1            0x31         '1'      11
2872      2            0x32         '2'      12
2873      3            0x33         '3'      13
2874      4            0x34         '4'      14
2875      5            0xE2  \342   U+2174   15-20
2876      6            0x85  \205    (cont)  15-20
2877      7            0xB4  \264    (cont)  15-20
2878      8            0xE2  \342   U+2175   21-26
2879      9            0x85  \205    (cont)  21-26
2880      10           0xB5  \265    (cont)  21-26
2881      11           0x37         '7'      27
2882      12           0x38         '8'      28
2883      13           0x39         '9'      29
2884      14           0x00                  30 (closing quote)
2885      -----------  ----  -----  -------  ---------------.  */
2886 
2887   cpp_string dst_string;
2888   const enum cpp_ttype type = CPP_STRING;
2889   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2890 				      &dst_string, type);
2891   ASSERT_TRUE (result);
2892   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2893 		(const char *)dst_string.text);
2894   free (const_cast <unsigned char *> (dst_string.text));
2895 
2896   /* Verify ranges of individual characters.  This no longer includes the
2897      opening quote, but does include the closing quote.
2898      '01234'.  */
2899   for (int i = 0; i <= 4; i++)
2900     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2901   /* U+2174.  */
2902   for (int i = 5; i <= 7; i++)
2903     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2904   /* U+2175.  */
2905   for (int i = 8; i <= 10; i++)
2906     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2907   /* '789' and nul terminator  */
2908   for (int i = 11; i <= 14; i++)
2909     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2910 
2911   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2912 }
2913 
2914 /* Lex a string literal containing UCN 8 characters.
2915    Verify the substring location data after running cpp_interpret_string
2916    on it.  */
2917 
2918 static void
test_lexer_string_locations_ucn8(const line_table_case & case_)2919 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2920 {
2921   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2922      ....................000000000.111111.1111222222.2222333333333.344444
2923      ....................123456789.012345.6789012345.6789012345678.901234  */
2924   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2925   lexer_test test (case_, content, NULL);
2926 
2927   /* Verify that we get the expected token back, with the correct
2928      location information.  */
2929   const cpp_token *tok = test.get_token ();
2930   ASSERT_EQ (tok->type, CPP_STRING);
2931   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2932 			   "\"01234\\U00002174\\U00002175789\"");
2933 
2934   /* Verify that cpp_interpret_string works.
2935      The UTF-8 encoding of the string is identical to that from
2936      the ucn4 testcase above; the only difference is the column
2937      locations.  */
2938   cpp_string dst_string;
2939   const enum cpp_ttype type = CPP_STRING;
2940   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2941 				      &dst_string, type);
2942   ASSERT_TRUE (result);
2943   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2944 		(const char *)dst_string.text);
2945   free (const_cast <unsigned char *> (dst_string.text));
2946 
2947   /* Verify ranges of individual characters.  This no longer includes the
2948      opening quote, but does include the closing quote.
2949      '01234'.  */
2950   for (int i = 0; i <= 4; i++)
2951     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2952   /* U+2174.  */
2953   for (int i = 5; i <= 7; i++)
2954     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2955   /* U+2175.  */
2956   for (int i = 8; i <= 10; i++)
2957     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2958   /* '789' at columns 35-37  */
2959   for (int i = 11; i <= 13; i++)
2960     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2961   /* Closing quote/nul-terminator at column 38.  */
2962   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2963 
2964   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2965 }
2966 
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   and return it as a value in host byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);

  /* Accumulate starting from the most significant byte, which comes
     first in memory.  */
  uint32_t value = 0;
  for (int i = 0; i < 4; i++)
    value = (value << 8) | (uint32_t) bytes[i];
  return value;
}
2978 
2979 /* Lex a wide string literal and verify that attempts to read substring
2980    location data from it fail gracefully.  */
2981 
2982 static void
test_lexer_string_locations_wide_string(const line_table_case & case_)2983 test_lexer_string_locations_wide_string (const line_table_case &case_)
2984 {
2985   /* Digits 0-9.
2986      ....................000000000.11111111112.22222222233333
2987      ....................123456789.01234567890.12345678901234  */
2988   const char *content = "       L\"0123456789\" /* non-str */\n";
2989   lexer_test test (case_, content, NULL);
2990 
2991   /* Verify that we get the expected token back, with the correct
2992      location information.  */
2993   const cpp_token *tok = test.get_token ();
2994   ASSERT_EQ (tok->type, CPP_WSTRING);
2995   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2996 
2997   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2998   cpp_string dst_string;
2999   const enum cpp_ttype type = CPP_WSTRING;
3000   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3001 				      &dst_string, type);
3002   ASSERT_TRUE (result);
3003   /* The cpp_reader defaults to big-endian with
3004      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3005      now be encoded as UTF-32BE.  */
3006   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3007   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3008   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3009   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3010   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3011   free (const_cast <unsigned char *> (dst_string.text));
3012 
3013   /* We don't yet support generating substring location information
3014      for L"" strings.  */
3015   ASSERT_HAS_NO_SUBSTRING_RANGES
3016     (test, tok->src_loc, type,
3017      "execution character set != source character set");
3018 }
3019 
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   and return it as a value in host byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);

  /* The most significant byte comes first in memory.  */
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return static_cast <uint16_t> ((high << 8) | low);
}
3028 
3029 /* Lex a u"" string literal and verify that attempts to read substring
3030    location data from it fail gracefully.  */
3031 
3032 static void
test_lexer_string_locations_string16(const line_table_case & case_)3033 test_lexer_string_locations_string16 (const line_table_case &case_)
3034 {
3035   /* Digits 0-9.
3036      ....................000000000.11111111112.22222222233333
3037      ....................123456789.01234567890.12345678901234  */
3038   const char *content = "       u\"0123456789\" /* non-str */\n";
3039   lexer_test test (case_, content, NULL);
3040 
3041   /* Verify that we get the expected token back, with the correct
3042      location information.  */
3043   const cpp_token *tok = test.get_token ();
3044   ASSERT_EQ (tok->type, CPP_STRING16);
3045   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3046 
3047   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
3048   cpp_string dst_string;
3049   const enum cpp_ttype type = CPP_STRING16;
3050   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3051 				      &dst_string, type);
3052   ASSERT_TRUE (result);
3053 
3054   /* The cpp_reader defaults to big-endian, so dst_string should
3055      now be encoded as UTF-16BE.  */
3056   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3057   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3058   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3059   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3060   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3061   free (const_cast <unsigned char *> (dst_string.text));
3062 
3063   /* We don't yet support generating substring location information
3064      for L"" strings.  */
3065   ASSERT_HAS_NO_SUBSTRING_RANGES
3066     (test, tok->src_loc, type,
3067      "execution character set != source character set");
3068 }
3069 
3070 /* Lex a U"" string literal and verify that attempts to read substring
3071    location data from it fail gracefully.  */
3072 
3073 static void
test_lexer_string_locations_string32(const line_table_case & case_)3074 test_lexer_string_locations_string32 (const line_table_case &case_)
3075 {
3076   /* Digits 0-9.
3077      ....................000000000.11111111112.22222222233333
3078      ....................123456789.01234567890.12345678901234  */
3079   const char *content = "       U\"0123456789\" /* non-str */\n";
3080   lexer_test test (case_, content, NULL);
3081 
3082   /* Verify that we get the expected token back, with the correct
3083      location information.  */
3084   const cpp_token *tok = test.get_token ();
3085   ASSERT_EQ (tok->type, CPP_STRING32);
3086   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3087 
3088   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
3089   cpp_string dst_string;
3090   const enum cpp_ttype type = CPP_STRING32;
3091   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3092 				      &dst_string, type);
3093   ASSERT_TRUE (result);
3094 
3095   /* The cpp_reader defaults to big-endian, so dst_string should
3096      now be encoded as UTF-32BE.  */
3097   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3098   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3099   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3100   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3101   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3102   free (const_cast <unsigned char *> (dst_string.text));
3103 
3104   /* We don't yet support generating substring location information
3105      for L"" strings.  */
3106   ASSERT_HAS_NO_SUBSTRING_RANGES
3107     (test, tok->src_loc, type,
3108      "execution character set != source character set");
3109 }
3110 
3111 /* Lex a u8-string literal.
3112    Verify the substring location data after running cpp_interpret_string
3113    on it.  */
3114 
3115 static void
test_lexer_string_locations_u8(const line_table_case & case_)3116 test_lexer_string_locations_u8 (const line_table_case &case_)
3117 {
3118   /* Digits 0-9.
3119      ....................000000000.11111111112.22222222233333
3120      ....................123456789.01234567890.12345678901234  */
3121   const char *content = "      u8\"0123456789\" /* non-str */\n";
3122   lexer_test test (case_, content, NULL);
3123 
3124   /* Verify that we get the expected token back, with the correct
3125      location information.  */
3126   const cpp_token *tok = test.get_token ();
3127   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3128   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3129 
3130   /* Verify that cpp_interpret_string works.  */
3131   cpp_string dst_string;
3132   const enum cpp_ttype type = CPP_STRING;
3133   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3134 				      &dst_string, type);
3135   ASSERT_TRUE (result);
3136   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3137   free (const_cast <unsigned char *> (dst_string.text));
3138 
3139   /* Verify ranges of individual characters.  This no longer includes the
3140      opening quote, but does include the closing quote.  */
3141   for (int i = 0; i <= 10; i++)
3142     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3143 }
3144 
3145 /* Lex a string literal containing UTF-8 source characters.
3146    Verify the substring location data after running cpp_interpret_string
3147    on it.  */
3148 
3149 static void
test_lexer_string_locations_utf8_source(const line_table_case & case_)3150 test_lexer_string_locations_utf8_source (const line_table_case &case_)
3151 {
3152  /* This string literal is written out to the source file as UTF-8,
3153     and is of the form "before mojibake after", where "mojibake"
3154     is written as the following four unicode code points:
3155        U+6587 CJK UNIFIED IDEOGRAPH-6587
3156        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3157        U+5316 CJK UNIFIED IDEOGRAPH-5316
3158        U+3051 HIRAGANA LETTER KE.
3159      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3160      "before" and "after" are 1 byte per unicode character.
3161 
3162      The numbering shown are "columns", which are *byte* numbers within
3163      the line, rather than unicode character numbers.
3164 
3165      .................... 000000000.1111111.
3166      .................... 123456789.0123456.  */
3167   const char *content = ("        \"before "
3168 			 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3169 			      UTF-8: 0xE6 0x96 0x87
3170 			      C octal escaped UTF-8: \346\226\207
3171 			    "column" numbers: 17-19.  */
3172 			 "\346\226\207"
3173 
3174 			 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3175 			      UTF-8: 0xE5 0xAD 0x97
3176 			      C octal escaped UTF-8: \345\255\227
3177 			    "column" numbers: 20-22.  */
3178 			 "\345\255\227"
3179 
3180 			 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3181 			      UTF-8: 0xE5 0x8C 0x96
3182 			      C octal escaped UTF-8: \345\214\226
3183 			    "column" numbers: 23-25.  */
3184 			 "\345\214\226"
3185 
3186 			 /* U+3051 HIRAGANA LETTER KE
3187 			      UTF-8: 0xE3 0x81 0x91
3188 			      C octal escaped UTF-8: \343\201\221
3189 			    "column" numbers: 26-28.  */
3190 			 "\343\201\221"
3191 
3192 			 /* column numbers 29 onwards
3193 			  2333333.33334444444444
3194 			  9012345.67890123456789. */
3195 			 " after\" /* non-str */\n");
3196   lexer_test test (case_, content, NULL);
3197 
3198   /* Verify that we get the expected token back, with the correct
3199      location information.  */
3200   const cpp_token *tok = test.get_token ();
3201   ASSERT_EQ (tok->type, CPP_STRING);
3202   ASSERT_TOKEN_AS_TEXT_EQ
3203     (test.m_parser, tok,
3204      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3205 
3206   /* Verify that cpp_interpret_string works.  */
3207   cpp_string dst_string;
3208   const enum cpp_ttype type = CPP_STRING;
3209   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3210 				      &dst_string, type);
3211   ASSERT_TRUE (result);
3212   ASSERT_STREQ
3213     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3214      (const char *)dst_string.text);
3215   free (const_cast <unsigned char *> (dst_string.text));
3216 
3217   /* Verify ranges of individual characters.  This no longer includes the
3218      opening quote, but does include the closing quote.
3219      Assuming that both source and execution encodings are UTF-8, we have
3220      a run of 25 octets in each, plus the NUL terminator.  */
3221   for (int i = 0; i < 25; i++)
3222     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3223   /* NUL-terminator should use the closing quote at column 35.  */
3224   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3225 
3226   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3227 }
3228 
3229 /* Test of string literal concatenation.  */
3230 
3231 static void
test_lexer_string_locations_concatenation_1(const line_table_case & case_)3232 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3233 {
3234   /* Digits 0-9.
3235      .....................000000000.111111.11112222222222
3236      .....................123456789.012345.67890123456789.  */
3237   const char *content = ("        \"01234\" /* non-str */\n"
3238 			 "        \"56789\" /* non-str */\n");
3239   lexer_test test (case_, content, NULL);
3240 
3241   location_t input_locs[2];
3242 
3243   /* Verify that we get the expected tokens back.  */
3244   auto_vec <cpp_string> input_strings;
3245   const cpp_token *tok_a = test.get_token ();
3246   ASSERT_EQ (tok_a->type, CPP_STRING);
3247   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3248   input_strings.safe_push (tok_a->val.str);
3249   input_locs[0] = tok_a->src_loc;
3250 
3251   const cpp_token *tok_b = test.get_token ();
3252   ASSERT_EQ (tok_b->type, CPP_STRING);
3253   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3254   input_strings.safe_push (tok_b->val.str);
3255   input_locs[1] = tok_b->src_loc;
3256 
3257   /* Verify that cpp_interpret_string works.  */
3258   cpp_string dst_string;
3259   const enum cpp_ttype type = CPP_STRING;
3260   bool result = cpp_interpret_string (test.m_parser,
3261 				      input_strings.address (), 2,
3262 				      &dst_string, type);
3263   ASSERT_TRUE (result);
3264   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3265   free (const_cast <unsigned char *> (dst_string.text));
3266 
3267   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3268   test.m_concats.record_string_concatenation (2, input_locs);
3269 
3270   location_t initial_loc = input_locs[0];
3271 
3272   /* "01234" on line 1.  */
3273   for (int i = 0; i <= 4; i++)
3274     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3275   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3276   for (int i = 5; i <= 10; i++)
3277     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3278 
3279   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3280 }
3281 
3282 /* Another test of string literal concatenation.  */
3283 
3284 static void
test_lexer_string_locations_concatenation_2(const line_table_case & case_)3285 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3286 {
3287   /* Digits 0-9.
3288      .....................000000000.111.11111112222222
3289      .....................123456789.012.34567890123456.  */
3290   const char *content = ("        \"01\" /* non-str */\n"
3291 			 "        \"23\" /* non-str */\n"
3292 			 "        \"45\" /* non-str */\n"
3293 			 "        \"67\" /* non-str */\n"
3294 			 "        \"89\" /* non-str */\n");
3295   lexer_test test (case_, content, NULL);
3296 
3297   auto_vec <cpp_string> input_strings;
3298   location_t input_locs[5];
3299 
3300   /* Verify that we get the expected tokens back.  */
3301   for (int i = 0; i < 5; i++)
3302     {
3303       const cpp_token *tok = test.get_token ();
3304       ASSERT_EQ (tok->type, CPP_STRING);
3305       input_strings.safe_push (tok->val.str);
3306       input_locs[i] = tok->src_loc;
3307     }
3308 
3309   /* Verify that cpp_interpret_string works.  */
3310   cpp_string dst_string;
3311   const enum cpp_ttype type = CPP_STRING;
3312   bool result = cpp_interpret_string (test.m_parser,
3313 				      input_strings.address (), 5,
3314 				      &dst_string, type);
3315   ASSERT_TRUE (result);
3316   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3317   free (const_cast <unsigned char *> (dst_string.text));
3318 
3319   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3320   test.m_concats.record_string_concatenation (5, input_locs);
3321 
3322   location_t initial_loc = input_locs[0];
3323 
3324   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3325      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3326      and expect get_source_range_for_substring to fail.
3327      However, for a string concatenation test, we can have a case
3328      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3329      but subsequent strings can be after it.
3330      Attempting to detect this within assert_char_at_range
3331      would overcomplicate the logic for the common test cases, so
3332      we detect it here.  */
3333   if (should_have_column_data_p (input_locs[0])
3334       && !should_have_column_data_p (input_locs[4]))
3335     {
3336       /* Verify that get_source_range_for_substring gracefully rejects
3337 	 this case.  */
3338       source_range actual_range;
3339       const char *err
3340 	= get_source_range_for_char (test.m_parser, &test.m_concats,
3341 				     initial_loc, type, 0, &actual_range);
3342       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3343       return;
3344     }
3345 
3346   for (int i = 0; i < 5; i++)
3347     for (int j = 0; j < 2; j++)
3348       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3349 			    i + 1, 10 + j, 10 + j);
3350 
3351   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3352   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3353 
3354   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3355 }
3356 
3357 /* Another test of string literal concatenation, this time combined with
3358    various kinds of escaped characters.  */
3359 
3360 static void
test_lexer_string_locations_concatenation_3(const line_table_case & case_)3361 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3362 {
3363   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3364      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3365   const char *content
3366     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3367        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3368     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3369   lexer_test test (case_, content, NULL);
3370 
3371   auto_vec <cpp_string> input_strings;
3372   location_t input_locs[4];
3373 
3374   /* Verify that we get the expected tokens back.  */
3375   for (int i = 0; i < 4; i++)
3376     {
3377       const cpp_token *tok = test.get_token ();
3378       ASSERT_EQ (tok->type, CPP_STRING);
3379       input_strings.safe_push (tok->val.str);
3380       input_locs[i] = tok->src_loc;
3381     }
3382 
3383   /* Verify that cpp_interpret_string works.  */
3384   cpp_string dst_string;
3385   const enum cpp_ttype type = CPP_STRING;
3386   bool result = cpp_interpret_string (test.m_parser,
3387 				      input_strings.address (), 4,
3388 				      &dst_string, type);
3389   ASSERT_TRUE (result);
3390   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3391   free (const_cast <unsigned char *> (dst_string.text));
3392 
3393   /* Simulate c-lex.cc's lex_string in order to record concatenation.  */
3394   test.m_concats.record_string_concatenation (4, input_locs);
3395 
3396   location_t initial_loc = input_locs[0];
3397 
3398   for (int i = 0; i <= 4; i++)
3399     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3400   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3401   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3402   for (int i = 7; i <= 9; i++)
3403     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3404 
3405   /* NUL-terminator should use the location of the final closing quote.  */
3406   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3407 
3408   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3409 }
3410 
3411 /* Test of string literal in a macro.  */
3412 
3413 static void
test_lexer_string_locations_macro(const line_table_case & case_)3414 test_lexer_string_locations_macro (const line_table_case &case_)
3415 {
3416   /* Digits 0-9.
3417      .....................0000000001111111111.22222222223.
3418      .....................1234567890123456789.01234567890.  */
3419   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3420 			 "  MACRO");
3421   lexer_test test (case_, content, NULL);
3422 
3423   /* Verify that we get the expected tokens back.  */
3424   const cpp_token *tok = test.get_token ();
3425   ASSERT_EQ (tok->type, CPP_PADDING);
3426 
3427   tok = test.get_token ();
3428   ASSERT_EQ (tok->type, CPP_STRING);
3429   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3430 
3431   /* Verify ranges of individual characters.  We ought to
3432      see columns within the macro definition.  */
3433   for (int i = 0; i <= 10; i++)
3434     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3435 			  i, 1, 20 + i, 20 + i);
3436 
3437   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3438 
3439   tok = test.get_token ();
3440   ASSERT_EQ (tok->type, CPP_PADDING);
3441 }
3442 
3443 /* Test of stringification of a macro argument.  */
3444 
3445 static void
test_lexer_string_locations_stringified_macro_argument(const line_table_case & case_)3446 test_lexer_string_locations_stringified_macro_argument
3447   (const line_table_case &case_)
3448 {
3449   /* .....................000000000111111111122222222223.
3450      .....................123456789012345678901234567890.  */
3451   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3452 			 "MACRO(foo)\n");
3453   lexer_test test (case_, content, NULL);
3454 
3455   /* Verify that we get the expected token back.  */
3456   const cpp_token *tok = test.get_token ();
3457   ASSERT_EQ (tok->type, CPP_PADDING);
3458 
3459   tok = test.get_token ();
3460   ASSERT_EQ (tok->type, CPP_STRING);
3461   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3462 
3463   /* We don't support getting the location of a stringified macro
3464      argument.  Verify that it fails gracefully.  */
3465   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3466 				  "cpp_interpret_string_1 failed");
3467 
3468   tok = test.get_token ();
3469   ASSERT_EQ (tok->type, CPP_PADDING);
3470 
3471   tok = test.get_token ();
3472   ASSERT_EQ (tok->type, CPP_PADDING);
3473 }
3474 
3475 /* Ensure that we are fail gracefully if something attempts to pass
3476    in a location that isn't a string literal token.  Seen on this code:
3477 
3478      const char a[] = " %d ";
3479      __builtin_printf (a, 0.5);
3480                        ^
3481 
3482    when c-format.cc erroneously used the indicated one-character
3483    location as the format string location, leading to a read past the
3484    end of a string buffer in cpp_interpret_string_1.  */
3485 
3486 static void
test_lexer_string_locations_non_string(const line_table_case & case_)3487 test_lexer_string_locations_non_string (const line_table_case &case_)
3488 {
3489   /* .....................000000000111111111122222222223.
3490      .....................123456789012345678901234567890.  */
3491   const char *content = ("         a\n");
3492   lexer_test test (case_, content, NULL);
3493 
3494   /* Verify that we get the expected token back.  */
3495   const cpp_token *tok = test.get_token ();
3496   ASSERT_EQ (tok->type, CPP_NAME);
3497   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3498 
3499   /* At this point, libcpp is attempting to interpret the name as a
3500      string literal, despite it not starting with a quote.  We don't detect
3501      that, but we should at least fail gracefully.  */
3502   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3503 				  "cpp_interpret_string_1 failed");
3504 }
3505 
3506 /* Ensure that we can read substring information for a token which
3507    starts in one linemap and ends in another .  Adapted from
3508    gcc.dg/cpp/pr69985.c.  */
3509 
3510 static void
test_lexer_string_locations_long_line(const line_table_case & case_)3511 test_lexer_string_locations_long_line (const line_table_case &case_)
3512 {
3513   /* .....................000000.000111111111
3514      .....................123456.789012346789.  */
3515   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3516 			 "     \"0123456789012345678901234567890123456789"
3517 			 "0123456789012345678901234567890123456789"
3518 			 "0123456789012345678901234567890123456789"
3519 			 "0123456789\"\n");
3520 
3521   lexer_test test (case_, content, NULL);
3522 
3523   /* Verify that we get the expected token back.  */
3524   const cpp_token *tok = test.get_token ();
3525   ASSERT_EQ (tok->type, CPP_STRING);
3526 
3527   if (!should_have_column_data_p (line_table->highest_location))
3528     return;
3529 
3530   /* Verify ranges of individual characters.  */
3531   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3532   for (int i = 0; i < 131; i++)
3533     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3534 			  i, 2, 7 + i, 7 + i);
3535 }
3536 
3537 /* Test of locations within a raw string that doesn't contain a newline.  */
3538 
3539 static void
test_lexer_string_locations_raw_string_one_line(const line_table_case & case_)3540 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3541 {
3542   /* .....................00.0000000111111111122.
3543      .....................12.3456789012345678901.  */
3544   const char *content = ("R\"foo(0123456789)foo\"\n");
3545   lexer_test test (case_, content, NULL);
3546 
3547   /* Verify that we get the expected token back.  */
3548   const cpp_token *tok = test.get_token ();
3549   ASSERT_EQ (tok->type, CPP_STRING);
3550 
3551   /* Verify that cpp_interpret_string works.  */
3552   cpp_string dst_string;
3553   const enum cpp_ttype type = CPP_STRING;
3554   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3555 				      &dst_string, type);
3556   ASSERT_TRUE (result);
3557   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3558   free (const_cast <unsigned char *> (dst_string.text));
3559 
3560   if (!should_have_column_data_p (line_table->highest_location))
3561     return;
3562 
3563   /* 0-9, plus the nil terminator.  */
3564   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3565   for (int i = 0; i < 11; i++)
3566     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3567 			  i, 1, 7 + i, 7 + i);
3568 }
3569 
3570 /* Test of locations within a raw string that contains a newline.  */
3571 
3572 static void
test_lexer_string_locations_raw_string_multiline(const line_table_case & case_)3573 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3574 {
3575   /* .....................00.0000.
3576      .....................12.3456.  */
3577   const char *content = ("R\"foo(\n"
3578   /* .....................00000.
3579      .....................12345.  */
3580 			 "hello\n"
3581 			 "world\n"
3582   /* .....................00000.
3583      .....................12345.  */
3584 			 ")foo\"\n");
3585   lexer_test test (case_, content, NULL);
3586 
3587   /* Verify that we get the expected token back.  */
3588   const cpp_token *tok = test.get_token ();
3589   ASSERT_EQ (tok->type, CPP_STRING);
3590 
3591   /* Verify that cpp_interpret_string works.  */
3592   cpp_string dst_string;
3593   const enum cpp_ttype type = CPP_STRING;
3594   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3595 				      &dst_string, type);
3596   ASSERT_TRUE (result);
3597   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3598   free (const_cast <unsigned char *> (dst_string.text));
3599 
3600   if (!should_have_column_data_p (line_table->highest_location))
3601     return;
3602 
3603   /* Currently we don't support locations within raw strings that
3604      contain newlines.  */
3605   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3606 				  "range endpoints are on different lines");
3607 }
3608 
3609 /* Test of parsing an unterminated raw string.  */
3610 
3611 static void
test_lexer_string_locations_raw_string_unterminated(const line_table_case & case_)3612 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3613 {
3614   const char *content = "R\"ouch()ouCh\" /* etc */";
3615 
3616   lexer_diagnostic_sink diagnostics;
3617   lexer_test test (case_, content, &diagnostics);
3618   test.m_implicitly_expect_EOF = false;
3619 
3620   /* Attempt to parse the raw string.  */
3621   const cpp_token *tok = test.get_token ();
3622   ASSERT_EQ (tok->type, CPP_EOF);
3623 
3624   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3625   /* We expect the message "unterminated raw string"
3626      in the "cpplib" translation domain.
3627      It's not clear that dgettext is available on all supported hosts,
3628      so this assertion is commented-out for now.
3629        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3630                      diagnostics.m_diagnostics[0]);
3631   */
3632 }
3633 
3634 /* Test of lexing char constants.  */
3635 
3636 static void
test_lexer_char_constants(const line_table_case & case_)3637 test_lexer_char_constants (const line_table_case &case_)
3638 {
3639   /* Various char constants.
3640      .....................0000000001111111111.22222222223.
3641      .....................1234567890123456789.01234567890.  */
3642   const char *content = ("         'a'\n"
3643 			 "        u'a'\n"
3644 			 "        U'a'\n"
3645 			 "        L'a'\n"
3646 			 "         'abc'\n");
3647   lexer_test test (case_, content, NULL);
3648 
3649   /* Verify that we get the expected tokens back.  */
3650   /* 'a'.  */
3651   const cpp_token *tok = test.get_token ();
3652   ASSERT_EQ (tok->type, CPP_CHAR);
3653   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3654 
3655   unsigned int chars_seen;
3656   int unsignedp;
3657   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3658 					  &chars_seen, &unsignedp);
3659   ASSERT_EQ (cc, 'a');
3660   ASSERT_EQ (chars_seen, 1);
3661 
3662   /* u'a'.  */
3663   tok = test.get_token ();
3664   ASSERT_EQ (tok->type, CPP_CHAR16);
3665   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3666 
3667   /* U'a'.  */
3668   tok = test.get_token ();
3669   ASSERT_EQ (tok->type, CPP_CHAR32);
3670   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3671 
3672   /* L'a'.  */
3673   tok = test.get_token ();
3674   ASSERT_EQ (tok->type, CPP_WCHAR);
3675   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3676 
3677   /* 'abc' (c-char-sequence).  */
3678   tok = test.get_token ();
3679   ASSERT_EQ (tok->type, CPP_CHAR);
3680   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3681 }
3682 /* A table of interesting location_t values, giving one axis of our test
3683    matrix.  */
3684 
3685 static const location_t boundary_locations[] = {
3686   /* Zero means "don't override the default values for a new line_table".  */
3687   0,
3688 
3689   /* An arbitrary non-zero value that isn't close to one of
3690      the boundary values below.  */
3691   0x10000,
3692 
3693   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3694   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3695   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3696   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3697   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3698   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3699 
3700   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3701   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3702   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3703   LINE_MAP_MAX_LOCATION_WITH_COLS,
3704   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3705   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3706 };
3707 
3708 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3709 
3710 void
for_each_line_table_case(void (* testcase)(const line_table_case &))3711 for_each_line_table_case (void (*testcase) (const line_table_case &))
3712 {
3713   /* As noted above in the description of struct line_table_case,
3714      we want to explore a test matrix of interesting line_table
3715      situations, running various selftests for each case within the
3716      matrix.  */
3717 
3718   /* Run all tests with:
3719      (a) line_table->default_range_bits == 0, and
3720      (b) line_table->default_range_bits == 5.  */
3721   int num_cases_tested = 0;
3722   for (int default_range_bits = 0; default_range_bits <= 5;
3723        default_range_bits += 5)
3724     {
3725       /* ...and use each of the "interesting" location values as
3726 	 the starting location within line_table.  */
3727       const int num_boundary_locations
3728 	= sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3729       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3730 	{
3731 	  line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3732 
3733 	  testcase (c);
3734 
3735 	  num_cases_tested++;
3736 	}
3737     }
3738 
3739   /* Verify that we fully covered the test matrix.  */
3740   ASSERT_EQ (num_cases_tested, 2 * 12);
3741 }
3742 
3743 /* Verify that when presented with a consecutive pair of locations with
3744    a very large line offset, we don't attempt to consolidate them into
3745    a single ordinary linemap where the line offsets within the line map
3746    would lead to overflow (PR lto/88147).  */
3747 
3748 static void
test_line_offset_overflow()3749 test_line_offset_overflow ()
3750 {
3751   line_table_test ltt (line_table_case (5, 0));
3752 
3753   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3754   linemap_line_start (line_table, 1, 100);
3755   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3756   assert_loceq ("foo.c", 2578, 0, loc_a);
3757 
3758   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3759   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3760   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3761 
3762   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3763   assert_loceq ("foo.c", 404198, 0, loc_b);
3764 
3765   /* We should have started a new linemap, rather than attempting to store
3766      a very large line offset.  */
3767   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3768   ASSERT_NE (ordmap_a, ordmap_b);
3769 }
3770 
test_cpp_utf8()3771 void test_cpp_utf8 ()
3772 {
3773   const int def_tabstop = 8;
3774   cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3775 
3776   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3777   {
3778     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, policy);
3779     ASSERT_EQ (8, w_bad);
3780     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, policy);
3781     ASSERT_EQ (5, w_ctrl);
3782   }
3783 
3784   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3785   {
3786     const int w_pi = cpp_display_width ("\xcf\x80", 2, policy);
3787     ASSERT_EQ (1, w_pi);
3788     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, policy);
3789     ASSERT_EQ (2, w_emoji);
3790     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3791 							policy);
3792     ASSERT_EQ (1, w_umlaut_precomposed);
3793     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3794 						      policy);
3795     ASSERT_EQ (1, w_umlaut_combining);
3796     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, policy);
3797     ASSERT_EQ (2, w_han);
3798     const int w_ascii = cpp_display_width ("GCC", 3, policy);
3799     ASSERT_EQ (3, w_ascii);
3800     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3801 					   "\x9f! \xe4\xb8\xba y\xcc\x88",
3802 					   24, policy);
3803     ASSERT_EQ (18, w_mixed);
3804   }
3805 
3806   /* Verify that display width properly expands tabs.  */
3807   {
3808     const char *tstr = "\tabc\td";
3809     ASSERT_EQ (6, cpp_display_width (tstr, 6,
3810 				     cpp_char_column_policy (1, cpp_wcwidth)));
3811     ASSERT_EQ (10, cpp_display_width (tstr, 6,
3812 				      cpp_char_column_policy (3, cpp_wcwidth)));
3813     ASSERT_EQ (17, cpp_display_width (tstr, 6,
3814 				      cpp_char_column_policy (8, cpp_wcwidth)));
3815     ASSERT_EQ (1,
3816 	       cpp_display_column_to_byte_column
3817 		 (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
3818   }
3819 
3820   /* Verify that cpp_byte_column_to_display_column can go past the end,
3821      and similar edge cases.  */
3822   {
3823     const char *str
3824       /* Display columns.
3825          111111112345  */
3826       = "\xcf\x80 abc";
3827       /* 111122223456
3828 	 Byte columns.  */
3829 
3830     ASSERT_EQ (5, cpp_display_width (str, 6, policy));
3831     ASSERT_EQ (105,
3832 	       cpp_byte_column_to_display_column (str, 6, 106, policy));
3833     ASSERT_EQ (10000,
3834 	       cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
3835     ASSERT_EQ (0,
3836 	       cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
3837   }
3838 
3839   /* Verify that cpp_display_column_to_byte_column can go past the end,
3840      and similar edge cases, and check invertibility.  */
3841   {
3842     const char *str
3843       /* Display columns.
3844 	 000000000000000000000000000000000000011
3845 	 111111112222222234444444455555555678901  */
3846       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3847       /* 000000000000000000000000000000000111111
3848 	 111122223333444456666777788889999012345
3849 	 Byte columns.  */
3850     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
3851     ASSERT_EQ (15,
3852 	       cpp_display_column_to_byte_column (str, 15, 11, policy));
3853     ASSERT_EQ (115,
3854 	       cpp_display_column_to_byte_column (str, 15, 111, policy));
3855     ASSERT_EQ (10000,
3856 	       cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
3857     ASSERT_EQ (0,
3858 	       cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
3859 
3860     /* Verify that we do not interrupt a UTF-8 sequence.  */
3861     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
3862 
3863     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3864       {
3865 	const int disp_col
3866 	  = cpp_byte_column_to_display_column (str, 15, byte_col, policy);
3867 	const int byte_col2
3868 	  = cpp_display_column_to_byte_column (str, 15, disp_col, policy);
3869 
3870 	/* If we ask for the display column in the middle of a UTF-8
3871 	   sequence, it will return the length of the partial sequence,
3872 	   matching the behavior of GCC before display column support.
3873 	   Otherwise check the round trip was successful.  */
3874 	if (byte_col < 4)
3875 	  ASSERT_EQ (byte_col, disp_col);
3876 	else if (byte_col >= 6 && byte_col < 9)
3877 	  ASSERT_EQ (3 + (byte_col - 5), disp_col);
3878 	else
3879 	  ASSERT_EQ (byte_col2, byte_col);
3880       }
3881   }
3882 
3883 }
3884 
3885 /* Run all of the selftests within this file.  */
3886 
3887 void
input_cc_tests()3888 input_cc_tests ()
3889 {
3890   test_linenum_comparisons ();
3891   test_should_have_column_data_p ();
3892   test_unknown_location ();
3893   test_builtins ();
3894   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3895 
3896   for_each_line_table_case (test_accessing_ordinary_linemaps);
3897   for_each_line_table_case (test_lexer);
3898   for_each_line_table_case (test_lexer_string_locations_simple);
3899   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3900   for_each_line_table_case (test_lexer_string_locations_hex);
3901   for_each_line_table_case (test_lexer_string_locations_oct);
3902   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3903   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3904   for_each_line_table_case (test_lexer_string_locations_ucn4);
3905   for_each_line_table_case (test_lexer_string_locations_ucn8);
3906   for_each_line_table_case (test_lexer_string_locations_wide_string);
3907   for_each_line_table_case (test_lexer_string_locations_string16);
3908   for_each_line_table_case (test_lexer_string_locations_string32);
3909   for_each_line_table_case (test_lexer_string_locations_u8);
3910   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3911   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3912   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3913   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3914   for_each_line_table_case (test_lexer_string_locations_macro);
3915   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3916   for_each_line_table_case (test_lexer_string_locations_non_string);
3917   for_each_line_table_case (test_lexer_string_locations_long_line);
3918   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3919   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3920   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3921   for_each_line_table_case (test_lexer_char_constants);
3922 
3923   test_reading_source_line ();
3924 
3925   test_line_offset_overflow ();
3926 
3927   test_cpp_utf8 ();
3928 }
3929 
3930 } // namespace selftest
3931 
3932 #endif /* CHECKING_P */
3933