/*	$NetBSD: dosbuf.c,v 1.1.1.1 2016/01/10 21:36:21 christos Exp $	*/
2
/* Messy DOS-specific code for correctly treating binary, Unix text
   and DOS text files.

   This has several aspects:

     * Guessing the file type (unless the user tells us);
     * Stripping CR characters from DOS text files (otherwise regex
       functions won't work correctly);
     * Reporting correct byte count with -b for any kind of file.

*/
14
/* How the contents of the current input file are being treated.
   The enumerators keep their original order, hence their values.  */
typedef enum
{
  UNKNOWN,      /* not decided yet; guess_type is consulted */
  DOS_BINARY,   /* a NUL byte was found; the data is left untouched */
  DOS_TEXT,     /* CR-LF line ends were found; CRs get stripped */
  UNIX_TEXT     /* neither of the above; the data is left untouched */
} File_type;
18
/* One entry of the table that translates a position in the CR-stripped
   text back into a position in the file as it exists on disk.  */
struct dos_map
{
  off_t pos;    /* position in buffer passed to matcher */
  off_t add;    /* how much to add when reporting char position */
};
23
24 static int dos_report_unix_offset = 0;
25
26 static File_type dos_file_type = UNKNOWN;
27 static File_type dos_use_file_type = UNKNOWN;
28 static off_t dos_stripped_crs = 0;
29 static struct dos_map *dos_pos_map;
30 static int dos_pos_map_size = 0;
31 static int dos_pos_map_used = 0;
32 static int inp_map_idx = 0, out_map_idx = 1;
33
34 /* Guess DOS file type by looking at its contents. */
35 static inline File_type
guess_type(char * buf,register size_t buflen)36 guess_type (char *buf, register size_t buflen)
37 {
38 int crlf_seen = 0;
39 register char *bp = buf;
40
41 while (buflen--)
42 {
43 /* Treat a file as binary if it has a NUL character. */
44 if (!*bp)
45 return DOS_BINARY;
46
47 /* CR before LF means DOS text file (unless we later see
48 binary characters). */
49 else if (*bp == '\r' && buflen && bp[1] == '\n')
50 crlf_seen = 1;
51
52 bp++;
53 }
54
55 return crlf_seen ? DOS_TEXT : UNIX_TEXT;
56 }
57
58 /* Convert external DOS file representation to internal.
59 Return the count of characters left in the buffer.
60 Build table to map character positions when reporting byte counts. */
61 static inline int
undossify_input(register char * buf,size_t buflen)62 undossify_input (register char *buf, size_t buflen)
63 {
64 int chars_left = 0;
65
66 if (totalcc == 0)
67 {
68 /* New file: forget everything we knew about character
69 position mapping table and file type. */
70 inp_map_idx = 0;
71 out_map_idx = 1;
72 dos_pos_map_used = 0;
73 dos_stripped_crs = 0;
74 dos_file_type = dos_use_file_type;
75 }
76
77 /* Guess if this file is binary, unless we already know that. */
78 if (dos_file_type == UNKNOWN)
79 dos_file_type = guess_type(buf, buflen);
80
81 /* If this file is to be treated as DOS Text, strip the CR characters
82 and maybe build the table for character position mapping on output. */
83 if (dos_file_type == DOS_TEXT)
84 {
85 char *destp = buf;
86
87 while (buflen--)
88 {
89 if (*buf != '\r')
90 {
91 *destp++ = *buf++;
92 chars_left++;
93 }
94 else
95 {
96 buf++;
97 if (out_byte && !dos_report_unix_offset)
98 {
99 dos_stripped_crs++;
100 while (buflen && *buf == '\r')
101 {
102 dos_stripped_crs++;
103 buflen--;
104 buf++;
105 }
106 if (inp_map_idx >= dos_pos_map_size - 1)
107 {
108 dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
109 dos_pos_map =
110 (struct dos_map *)xrealloc((char *)dos_pos_map,
111 dos_pos_map_size *
112 sizeof(struct dos_map));
113 }
114
115 if (!inp_map_idx)
116 {
117 /* Add sentinel entry. */
118 dos_pos_map[inp_map_idx].pos = 0;
119 dos_pos_map[inp_map_idx++].add = 0;
120
121 /* Initialize first real entry. */
122 dos_pos_map[inp_map_idx].add = 0;
123 }
124
125 /* Put the new entry. If the stripped CR characters
126 precede a Newline (the usual case), pretend that
127 they were found *after* the Newline. This makes
128 displayed byte offsets more reasonable in some
129 cases, and fits better the intuitive notion that
130 the line ends *before* the CR, not *after* it. */
131 inp_map_idx++;
132 dos_pos_map[inp_map_idx-1].pos =
133 (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
134 dos_pos_map[inp_map_idx].add = dos_stripped_crs;
135 dos_pos_map_used = inp_map_idx;
136
137 /* The following will be updated on the next pass. */
138 dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
139 }
140 }
141 }
142
143 return chars_left;
144 }
145
146 return buflen;
147 }
148
149 /* Convert internal byte count into external. */
150 static inline off_t
dossified_pos(off_t byteno)151 dossified_pos (off_t byteno)
152 {
153 off_t pos_lo;
154 off_t pos_hi;
155
156 if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
157 return byteno;
158
159 /* Optimization: usually the file will be scanned sequentially.
160 So in most cases, this byte position will be found in the
161 table near the previous one, as recorded in `out_map_idx'. */
162 pos_lo = dos_pos_map[out_map_idx-1].pos;
163 pos_hi = dos_pos_map[out_map_idx].pos;
164
165 /* If the initial guess failed, search up or down, as
166 appropriate, beginning with the previous place. */
167 if (byteno >= pos_hi)
168 {
169 out_map_idx++;
170 while (out_map_idx < dos_pos_map_used &&
171 byteno >= dos_pos_map[out_map_idx].pos)
172 out_map_idx++;
173 }
174
175 else if (byteno < pos_lo)
176 {
177 out_map_idx--;
178 while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
179 out_map_idx--;
180 }
181
182 return byteno + dos_pos_map[out_map_idx].add;
183 }
184