xref: /netbsd-src/external/gpl3/gcc/dist/libcody/buffer.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1*b1e83836Smrg // CODYlib		-*- mode:c++ -*-
2*b1e83836Smrg // Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
3*b1e83836Smrg // License: Apache v2.0
4*b1e83836Smrg 
5*b1e83836Smrg // Cody
6*b1e83836Smrg #include "internal.hh"
7*b1e83836Smrg // C++
8*b1e83836Smrg #include <algorithm>
9*b1e83836Smrg // C
10*b1e83836Smrg #include <cstring>
11*b1e83836Smrg // OS
12*b1e83836Smrg #include <unistd.h>
13*b1e83836Smrg #include <cerrno>
14*b1e83836Smrg 
15*b1e83836Smrg // MessageBuffer code
16*b1e83836Smrg 
// Lines consist of words and end with a NEWLINE (0xa) char.
// Whitespace characters are TAB (0x9) and SPACE (0x20).
// Words consist of non-whitespace chars separated by whitespace.
// Multiple lines in one transaction are indicated by ending non-final
// lines with a SEMICOLON (0x3b) word, immediately before the NEWLINE.
// A line ending that way is continued by the line that follows it.
// Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
// Quoting is done with '...'.
// Anything outside of [-+_/%.a-zA-Z0-9] needs quoting.
// Inside quotes, anything below SPACE, DEL, \' or \\ needs escaping.
// Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
// Spaces separate words, UTF8 encoding for non-ascii chars.
29*b1e83836Smrg 
30*b1e83836Smrg namespace Cody {
31*b1e83836Smrg namespace Detail {
32*b1e83836Smrg 
33*b1e83836Smrg static const char CONTINUE = S2C(u8";");
34*b1e83836Smrg 
BeginLine()35*b1e83836Smrg void MessageBuffer::BeginLine ()
36*b1e83836Smrg {
37*b1e83836Smrg   if (!buffer.empty ())
38*b1e83836Smrg     {
39*b1e83836Smrg       // Terminate the previous line with a continuation
40*b1e83836Smrg       buffer.reserve (buffer.size () + 3);
41*b1e83836Smrg       buffer.push_back (S2C(u8" "));
42*b1e83836Smrg       buffer.push_back (CONTINUE);
43*b1e83836Smrg       buffer.push_back (S2C(u8"\n"));
44*b1e83836Smrg     }
45*b1e83836Smrg   lastBol = buffer.size ();
46*b1e83836Smrg }
47*b1e83836Smrg 
48*b1e83836Smrg // QUOTE means 'maybe quote', we search it for quote-needing chars
49*b1e83836Smrg 
Append(char const * str,bool quote,size_t len)50*b1e83836Smrg void MessageBuffer::Append (char const *str, bool quote, size_t len)
51*b1e83836Smrg {
52*b1e83836Smrg   if (len == ~size_t (0))
53*b1e83836Smrg     len = strlen (str);
54*b1e83836Smrg 
55*b1e83836Smrg   if (!len && !quote)
56*b1e83836Smrg     return;
57*b1e83836Smrg 
58*b1e83836Smrg   // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything
59*b1e83836Smrg   // that could remotely be shell-active.  UTF8 encoding for non-ascii.
60*b1e83836Smrg   if (quote && len)
61*b1e83836Smrg     {
62*b1e83836Smrg       quote = false;
63*b1e83836Smrg       // Scan looking for quote-needing characters.  We could just
64*b1e83836Smrg       // append until we find one, but that's probably confusing
65*b1e83836Smrg       for (size_t ix = len; ix--;)
66*b1e83836Smrg 	{
67*b1e83836Smrg 	  unsigned char c = (unsigned char)str[ix];
68*b1e83836Smrg 	  if (!((c >= S2C(u8"a") && c <= S2C(u8"z"))
69*b1e83836Smrg 		|| (c >= S2C(u8"A") && c <= S2C(u8"Z"))
70*b1e83836Smrg 		|| (c >= S2C(u8"0") && c <= S2C(u8"9"))
71*b1e83836Smrg 		|| c == S2C(u8"-") || c == S2C(u8"+") || c == S2C(u8"_")
72*b1e83836Smrg 		|| c == S2C(u8"/") || c == S2C(u8"%") || c == S2C(u8".")))
73*b1e83836Smrg 	    {
74*b1e83836Smrg 	      quote = true;
75*b1e83836Smrg 	      break;
76*b1e83836Smrg 	    }
77*b1e83836Smrg 	}
78*b1e83836Smrg     }
79*b1e83836Smrg 
80*b1e83836Smrg   // Maximal length of appended string
81*b1e83836Smrg   buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);
82*b1e83836Smrg 
83*b1e83836Smrg   if (quote)
84*b1e83836Smrg     buffer.push_back (S2C(u8"'"));
85*b1e83836Smrg 
86*b1e83836Smrg   for (auto *end = str + len; str != end;)
87*b1e83836Smrg     {
88*b1e83836Smrg       auto *e = end;
89*b1e83836Smrg 
90*b1e83836Smrg       if (quote)
91*b1e83836Smrg 	// Look for next escape-needing char.  More relaxed than
92*b1e83836Smrg 	// the earlier needs-quoting check.
93*b1e83836Smrg 	for (e = str; e != end; ++e)
94*b1e83836Smrg 	  {
95*b1e83836Smrg 	    unsigned char c = (unsigned char)*e;
96*b1e83836Smrg 	    if (c < S2C(u8" ") || c == 0x7f
97*b1e83836Smrg 		|| c == S2C(u8"\\") || c == S2C(u8"'"))
98*b1e83836Smrg 	      break;
99*b1e83836Smrg 	  }
100*b1e83836Smrg       buffer.insert (buffer.end (), str, e);
101*b1e83836Smrg       str = e;
102*b1e83836Smrg 
103*b1e83836Smrg       if (str == end)
104*b1e83836Smrg 	break;
105*b1e83836Smrg 
106*b1e83836Smrg       buffer.push_back (S2C(u8"\\"));
107*b1e83836Smrg       switch (unsigned char c = (unsigned char)*str++)
108*b1e83836Smrg 	{
109*b1e83836Smrg 	case S2C(u8"\t"):
110*b1e83836Smrg 	  c = S2C(u8"t");
111*b1e83836Smrg 	  goto append;
112*b1e83836Smrg 
113*b1e83836Smrg 	case S2C(u8"\n"):
114*b1e83836Smrg 	  c = S2C(u8"n");
115*b1e83836Smrg 	  goto append;
116*b1e83836Smrg 
117*b1e83836Smrg 	case S2C(u8"'"):
118*b1e83836Smrg 	case S2C(u8"\\"):
119*b1e83836Smrg 	append:
120*b1e83836Smrg 	  buffer.push_back (c);
121*b1e83836Smrg 	  break;
122*b1e83836Smrg 
123*b1e83836Smrg 	default:
124*b1e83836Smrg 	  // Full-on escape.  Use 2 lower-case hex chars
125*b1e83836Smrg 	  for (unsigned shift = 8; shift;)
126*b1e83836Smrg 	    {
127*b1e83836Smrg 	      shift -= 4;
128*b1e83836Smrg 
129*b1e83836Smrg 	      char nibble = (c >> shift) & 0xf;
130*b1e83836Smrg 	      nibble += S2C(u8"0");
131*b1e83836Smrg 	      if (nibble > S2C(u8"9"))
132*b1e83836Smrg 		nibble += S2C(u8"a") - (S2C(u8"9") + 1);
133*b1e83836Smrg 	      buffer.push_back (nibble);
134*b1e83836Smrg 	    }
135*b1e83836Smrg 	}
136*b1e83836Smrg     }
137*b1e83836Smrg 
138*b1e83836Smrg   if (quote)
139*b1e83836Smrg     buffer.push_back (S2C(u8"'"));
140*b1e83836Smrg }
141*b1e83836Smrg 
Append(char c)142*b1e83836Smrg void MessageBuffer::Append (char c)
143*b1e83836Smrg {
144*b1e83836Smrg   buffer.push_back (c);
145*b1e83836Smrg }
146*b1e83836Smrg 
AppendInteger(unsigned u)147*b1e83836Smrg void MessageBuffer::AppendInteger (unsigned u)
148*b1e83836Smrg {
149*b1e83836Smrg   // Sigh, even though std::to_string is C++11, we support building on
150*b1e83836Smrg   // gcc 4.8, which is a C++11 compiler lacking std::to_string.  so
151*b1e83836Smrg   // have something horrible.
152*b1e83836Smrg   std::string v (20, 0);
153*b1e83836Smrg   size_t len = snprintf (const_cast<char *> (v.data ()), v.size (), "%u", u);
154*b1e83836Smrg   v.erase (len);
155*b1e83836Smrg 
156*b1e83836Smrg   AppendWord (v);
157*b1e83836Smrg }
158*b1e83836Smrg 
Write(int fd)159*b1e83836Smrg int MessageBuffer::Write (int fd) noexcept
160*b1e83836Smrg {
161*b1e83836Smrg   size_t limit = buffer.size () - lastBol;
162*b1e83836Smrg   ssize_t count = write (fd, &buffer.data ()[lastBol], limit);
163*b1e83836Smrg 
164*b1e83836Smrg   int err = 0;
165*b1e83836Smrg   if (count < 0)
166*b1e83836Smrg     err = errno;
167*b1e83836Smrg   else
168*b1e83836Smrg     {
169*b1e83836Smrg       lastBol += count;
170*b1e83836Smrg       if (size_t (count) != limit)
171*b1e83836Smrg 	err = EAGAIN;
172*b1e83836Smrg     }
173*b1e83836Smrg 
174*b1e83836Smrg   if (err != EAGAIN && err != EINTR)
175*b1e83836Smrg     {
176*b1e83836Smrg       // Reset for next message
177*b1e83836Smrg       buffer.clear ();
178*b1e83836Smrg       lastBol = 0;
179*b1e83836Smrg     }
180*b1e83836Smrg 
181*b1e83836Smrg   return err;
182*b1e83836Smrg }
183*b1e83836Smrg 
Read(int fd)184*b1e83836Smrg int MessageBuffer::Read (int fd) noexcept
185*b1e83836Smrg {
186*b1e83836Smrg   constexpr size_t blockSize = 200;
187*b1e83836Smrg 
188*b1e83836Smrg   size_t lwm = buffer.size ();
189*b1e83836Smrg   size_t hwm = buffer.capacity ();
190*b1e83836Smrg   if (hwm - lwm < blockSize / 2)
191*b1e83836Smrg     hwm += blockSize;
192*b1e83836Smrg   buffer.resize (hwm);
193*b1e83836Smrg 
194*b1e83836Smrg   auto iter = buffer.begin () + lwm;
195*b1e83836Smrg   ssize_t count = read (fd, &*iter, hwm - lwm);
196*b1e83836Smrg   buffer.resize (lwm + (count >= 0 ? count : 0));
197*b1e83836Smrg 
198*b1e83836Smrg   if (count < 0)
199*b1e83836Smrg     return errno;
200*b1e83836Smrg 
201*b1e83836Smrg   if (!count)
202*b1e83836Smrg     // End of file
203*b1e83836Smrg     return -1;
204*b1e83836Smrg 
205*b1e83836Smrg   bool more = true;
206*b1e83836Smrg   for (;;)
207*b1e83836Smrg     {
208*b1e83836Smrg       auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));
209*b1e83836Smrg       if (newline == buffer.end ())
210*b1e83836Smrg 	break;
211*b1e83836Smrg       more = newline != buffer.begin () && newline[-1] == CONTINUE;
212*b1e83836Smrg       iter = newline + 1;
213*b1e83836Smrg 
214*b1e83836Smrg       if (iter == buffer.end ())
215*b1e83836Smrg 	break;
216*b1e83836Smrg 
217*b1e83836Smrg       if (!more)
218*b1e83836Smrg 	{
219*b1e83836Smrg 	  // There is no continuation, but there are chars after the
220*b1e83836Smrg 	  // newline.  Truncate the buffer and return an error
221*b1e83836Smrg 	  buffer.resize (iter - buffer.begin ());
222*b1e83836Smrg 	  return EINVAL;
223*b1e83836Smrg 	}
224*b1e83836Smrg     }
225*b1e83836Smrg 
226*b1e83836Smrg   return more ? EAGAIN : 0;
227*b1e83836Smrg }
228*b1e83836Smrg 
Lex(std::vector<std::string> & result)229*b1e83836Smrg int MessageBuffer::Lex (std::vector<std::string> &result)
230*b1e83836Smrg {
231*b1e83836Smrg   result.clear ();
232*b1e83836Smrg 
233*b1e83836Smrg   if (IsAtEnd ())
234*b1e83836Smrg     return ENOENT;
235*b1e83836Smrg 
236*b1e83836Smrg   Assert (buffer.back () == S2C(u8"\n"));
237*b1e83836Smrg 
238*b1e83836Smrg   auto iter = buffer.begin () + lastBol;
239*b1e83836Smrg 
240*b1e83836Smrg   for (std::string *word = nullptr;;)
241*b1e83836Smrg     {
242*b1e83836Smrg       char c = *iter;
243*b1e83836Smrg 
244*b1e83836Smrg       ++iter;
245*b1e83836Smrg       if (c == S2C(u8" ") || c == S2C(u8"\t"))
246*b1e83836Smrg 	{
247*b1e83836Smrg 	  word = nullptr;
248*b1e83836Smrg 	  continue;
249*b1e83836Smrg 	}
250*b1e83836Smrg 
251*b1e83836Smrg       if (c == S2C(u8"\n"))
252*b1e83836Smrg 	break;
253*b1e83836Smrg 
254*b1e83836Smrg       if (c == CONTINUE)
255*b1e83836Smrg 	{
256*b1e83836Smrg 	  // Line continuation
257*b1e83836Smrg 	  if (word || *iter != S2C(u8"\n"))
258*b1e83836Smrg 	    goto malformed;
259*b1e83836Smrg 	  ++iter;
260*b1e83836Smrg 	  break;
261*b1e83836Smrg 	}
262*b1e83836Smrg 
263*b1e83836Smrg       if (c <= S2C(u8" ") || c >= 0x7f)
264*b1e83836Smrg 	goto malformed;
265*b1e83836Smrg 
266*b1e83836Smrg       if (!word)
267*b1e83836Smrg 	{
268*b1e83836Smrg 	  result.emplace_back ();
269*b1e83836Smrg 	  word = &result.back ();
270*b1e83836Smrg 	}
271*b1e83836Smrg 
272*b1e83836Smrg       if (c == S2C(u8"'"))
273*b1e83836Smrg 	{
274*b1e83836Smrg 	  // Quoted word
275*b1e83836Smrg 	  for (;;)
276*b1e83836Smrg 	    {
277*b1e83836Smrg 	      c = *iter;
278*b1e83836Smrg 
279*b1e83836Smrg 	      if (c == S2C(u8"\n"))
280*b1e83836Smrg 		{
281*b1e83836Smrg 		malformed:;
282*b1e83836Smrg 		  result.clear ();
283*b1e83836Smrg 		  iter = std::find (iter, buffer.end (), S2C(u8"\n"));
284*b1e83836Smrg 		  auto back = iter;
285*b1e83836Smrg 		  if (back[-1] == CONTINUE  && back[-2] == S2C(u8" "))
286*b1e83836Smrg 		    // Smells like a line continuation
287*b1e83836Smrg 		    back -= 2;
288*b1e83836Smrg 		  result.emplace_back (&buffer[lastBol],
289*b1e83836Smrg 				       back - buffer.begin () - lastBol);
290*b1e83836Smrg 		  ++iter;
291*b1e83836Smrg 		  lastBol = iter - buffer.begin ();
292*b1e83836Smrg 		  return EINVAL;
293*b1e83836Smrg 		}
294*b1e83836Smrg 
295*b1e83836Smrg 	      if (c < S2C(u8" ") || c >= 0x7f)
296*b1e83836Smrg 		goto malformed;
297*b1e83836Smrg 
298*b1e83836Smrg 	      ++iter;
299*b1e83836Smrg 	      if (c == S2C(u8"'"))
300*b1e83836Smrg 		break;
301*b1e83836Smrg 
302*b1e83836Smrg 	      if (c == S2C(u8"\\"))
303*b1e83836Smrg 		// escape
304*b1e83836Smrg 		switch (c = *iter)
305*b1e83836Smrg 		  {
306*b1e83836Smrg 		    case S2C(u8"\\"):
307*b1e83836Smrg 		    case S2C(u8"'"):
308*b1e83836Smrg 		      ++iter;
309*b1e83836Smrg 		      break;
310*b1e83836Smrg 
311*b1e83836Smrg 		    case S2C(u8"n"):
312*b1e83836Smrg 		      c = S2C(u8"\n");
313*b1e83836Smrg 		      ++iter;
314*b1e83836Smrg 		      break;
315*b1e83836Smrg 
316*b1e83836Smrg 		    case S2C(u8"_"):
317*b1e83836Smrg 		      // We used to escape SPACE as \_, so accept that
318*b1e83836Smrg 		      c = S2C(u8" ");
319*b1e83836Smrg 		      ++iter;
320*b1e83836Smrg 		      break;
321*b1e83836Smrg 
322*b1e83836Smrg 		    case S2C(u8"t"):
323*b1e83836Smrg 		      c = S2C(u8"\t");
324*b1e83836Smrg 		      ++iter;
325*b1e83836Smrg 		      break;
326*b1e83836Smrg 
327*b1e83836Smrg 		    default:
328*b1e83836Smrg 		      {
329*b1e83836Smrg 			unsigned v = 0;
330*b1e83836Smrg 			for (unsigned nibble = 0; nibble != 2; nibble++)
331*b1e83836Smrg 			  {
332*b1e83836Smrg 			    c = *iter;
333*b1e83836Smrg 			    if (c < S2C(u8"0"))
334*b1e83836Smrg 			      {
335*b1e83836Smrg 				if (!nibble)
336*b1e83836Smrg 				  goto malformed;
337*b1e83836Smrg 				break;
338*b1e83836Smrg 			      }
339*b1e83836Smrg 			    else if (c <= S2C(u8"9"))
340*b1e83836Smrg 			      c -= S2C(u8"0");
341*b1e83836Smrg 			    else if (c < S2C(u8"a"))
342*b1e83836Smrg 			      {
343*b1e83836Smrg 				if (!nibble)
344*b1e83836Smrg 				  goto malformed;
345*b1e83836Smrg 				break;
346*b1e83836Smrg 			      }
347*b1e83836Smrg 			    else if (c <= S2C(u8"f"))
348*b1e83836Smrg 			      c -= S2C(u8"a") - 10;
349*b1e83836Smrg 			    else
350*b1e83836Smrg 			      {
351*b1e83836Smrg 				if (!nibble)
352*b1e83836Smrg 				  goto malformed;
353*b1e83836Smrg 				break;
354*b1e83836Smrg 			      }
355*b1e83836Smrg 			    ++iter;
356*b1e83836Smrg 			    v = (v << 4) | c;
357*b1e83836Smrg 			  }
358*b1e83836Smrg 			c = v;
359*b1e83836Smrg 		      }
360*b1e83836Smrg 		  }
361*b1e83836Smrg 	      word->push_back (c);
362*b1e83836Smrg 	    }
363*b1e83836Smrg 	}
364*b1e83836Smrg       else
365*b1e83836Smrg 	// Unquoted character
366*b1e83836Smrg 	word->push_back (c);
367*b1e83836Smrg     }
368*b1e83836Smrg   lastBol = iter - buffer.begin ();
369*b1e83836Smrg   if (result.empty ())
370*b1e83836Smrg     return ENOENT;
371*b1e83836Smrg 
372*b1e83836Smrg   return 0;
373*b1e83836Smrg }
374*b1e83836Smrg 
LexedLine(std::string & str)375*b1e83836Smrg void MessageBuffer::LexedLine (std::string &str)
376*b1e83836Smrg {
377*b1e83836Smrg   if (lastBol)
378*b1e83836Smrg     {
379*b1e83836Smrg       size_t pos = lastBol - 1;
380*b1e83836Smrg       for (; pos; pos--)
381*b1e83836Smrg 	if (buffer[pos-1] == S2C(u8"\n"))
382*b1e83836Smrg 	  break;
383*b1e83836Smrg 
384*b1e83836Smrg       size_t end = lastBol - 1;
385*b1e83836Smrg       if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" "))
386*b1e83836Smrg 	// Strip line continuation
387*b1e83836Smrg 	end -= 2;
388*b1e83836Smrg       str.append (&buffer[pos], end - pos);
389*b1e83836Smrg     }
390*b1e83836Smrg }
391*b1e83836Smrg } // Detail
392*b1e83836Smrg } // Cody
393