xref: /netbsd-src/external/gpl2/grep/dist/src/dosbuf.c (revision a8fa202a6440953be7b92a8960a811bff58203f4)
1 /*	$NetBSD: dosbuf.c,v 1.1.1.1 2016/01/10 21:36:21 christos Exp $	*/
2 
3 /* Messy DOS-specific code for correctly treating binary, Unix text
4    and DOS text files.
5 
6    This has several aspects:
7 
8      * Guessing the file type (unless the user tells us);
9      * Stripping CR characters from DOS text files (otherwise regex
10        functions won't work correctly);
11      * Reporting correct byte count with -b for any kind of file.
12 
13 */
14 
/* Classification of an input file's contents.  */
typedef enum
{
  UNKNOWN = 0,		/* not decided yet; the type will be guessed */
  DOS_BINARY = 1,	/* data containing a NUL character */
  DOS_TEXT = 2,		/* text in which a CR-LF pair was seen */
  UNIX_TEXT = 3		/* text with neither NUL nor CR-LF */
} File_type;
18 
/* One entry of the table used to translate a byte position in the
   CR-stripped buffer into the position to report to the user.  */
struct dos_map
{
  /* Position in the buffer passed to the matcher.  */
  off_t pos;

  /* Amount to add to a position when reporting it.  */
  off_t add;
};
23 
24 static int       dos_report_unix_offset = 0;
25 
26 static File_type dos_file_type     = UNKNOWN;
27 static File_type dos_use_file_type = UNKNOWN;
28 static off_t     dos_stripped_crs  = 0;
29 static struct dos_map *dos_pos_map;
30 static int       dos_pos_map_size  = 0;
31 static int       dos_pos_map_used  = 0;
32 static int       inp_map_idx = 0, out_map_idx = 1;
33 
34 /* Guess DOS file type by looking at its contents.  */
35 static inline File_type
guess_type(char * buf,register size_t buflen)36 guess_type (char *buf, register size_t buflen)
37 {
38   int crlf_seen = 0;
39   register char *bp = buf;
40 
41   while (buflen--)
42     {
43       /* Treat a file as binary if it has a NUL character.  */
44       if (!*bp)
45         return DOS_BINARY;
46 
47       /* CR before LF means DOS text file (unless we later see
48          binary characters).  */
49       else if (*bp == '\r' && buflen && bp[1] == '\n')
50         crlf_seen = 1;
51 
52       bp++;
53     }
54 
55   return crlf_seen ? DOS_TEXT : UNIX_TEXT;
56 }
57 
58 /* Convert external DOS file representation to internal.
59    Return the count of characters left in the buffer.
60    Build table to map character positions when reporting byte counts.  */
61 static inline int
undossify_input(register char * buf,size_t buflen)62 undossify_input (register char *buf, size_t buflen)
63 {
64   int chars_left = 0;
65 
66   if (totalcc == 0)
67     {
68       /* New file: forget everything we knew about character
69          position mapping table and file type.  */
70       inp_map_idx = 0;
71       out_map_idx = 1;
72       dos_pos_map_used = 0;
73       dos_stripped_crs = 0;
74       dos_file_type = dos_use_file_type;
75     }
76 
77   /* Guess if this file is binary, unless we already know that.  */
78   if (dos_file_type == UNKNOWN)
79     dos_file_type = guess_type(buf, buflen);
80 
81   /* If this file is to be treated as DOS Text, strip the CR characters
82      and maybe build the table for character position mapping on output.  */
83   if (dos_file_type == DOS_TEXT)
84     {
85       char   *destp   = buf;
86 
87       while (buflen--)
88         {
89           if (*buf != '\r')
90             {
91               *destp++ = *buf++;
92               chars_left++;
93             }
94           else
95             {
96               buf++;
97               if (out_byte && !dos_report_unix_offset)
98                 {
99                   dos_stripped_crs++;
100                   while (buflen && *buf == '\r')
101                     {
102                       dos_stripped_crs++;
103                       buflen--;
104                       buf++;
105                     }
106                   if (inp_map_idx >= dos_pos_map_size - 1)
107                     {
108                       dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
109                       dos_pos_map =
110                         (struct dos_map *)xrealloc((char *)dos_pos_map,
111 						   dos_pos_map_size *
112 						   sizeof(struct dos_map));
113                     }
114 
115                   if (!inp_map_idx)
116                     {
117                       /* Add sentinel entry.  */
118                       dos_pos_map[inp_map_idx].pos = 0;
119                       dos_pos_map[inp_map_idx++].add = 0;
120 
121                       /* Initialize first real entry.  */
122                       dos_pos_map[inp_map_idx].add = 0;
123                     }
124 
125                   /* Put the new entry.  If the stripped CR characters
126                      precede a Newline (the usual case), pretend that
127                      they were found *after* the Newline.  This makes
128                      displayed byte offsets more reasonable in some
129                      cases, and fits better the intuitive notion that
130                      the line ends *before* the CR, not *after* it.  */
131                   inp_map_idx++;
132                   dos_pos_map[inp_map_idx-1].pos =
133                     (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
134                   dos_pos_map[inp_map_idx].add = dos_stripped_crs;
135                   dos_pos_map_used = inp_map_idx;
136 
137                   /* The following will be updated on the next pass.  */
138                   dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
139                 }
140             }
141         }
142 
143       return chars_left;
144     }
145 
146   return buflen;
147 }
148 
149 /* Convert internal byte count into external.  */
150 static inline off_t
dossified_pos(off_t byteno)151 dossified_pos (off_t byteno)
152 {
153   off_t pos_lo;
154   off_t pos_hi;
155 
156   if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
157     return byteno;
158 
159   /* Optimization: usually the file will be scanned sequentially.
160      So in most cases, this byte position will be found in the
161      table near the previous one, as recorded in `out_map_idx'.  */
162   pos_lo = dos_pos_map[out_map_idx-1].pos;
163   pos_hi = dos_pos_map[out_map_idx].pos;
164 
165   /* If the initial guess failed, search up or down, as
166      appropriate, beginning with the previous place.  */
167   if (byteno >= pos_hi)
168     {
169       out_map_idx++;
170       while (out_map_idx < dos_pos_map_used &&
171              byteno >= dos_pos_map[out_map_idx].pos)
172         out_map_idx++;
173     }
174 
175   else if (byteno < pos_lo)
176     {
177       out_map_idx--;
178       while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
179         out_map_idx--;
180     }
181 
182   return byteno + dos_pos_map[out_map_idx].add;
183 }
184