1// CODYlib		-*- mode:c++ -*-
2// Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
3// License: Apache v2.0
4
5// Cody
6#include "internal.hh"
7// C++
8#include <algorithm>
9// C
10#include <cstring>
11// OS
12#include <unistd.h>
13#include <cerrno>
14
15// MessageBuffer code
16
17// Lines consist of words and end with a NEWLINE (0xa) char
18// Whitespace characters are TAB (0x9) and SPACE (0x20)
19// Words consist of non-whitespace chars separated by whitespace.
// Multiple lines in one transaction are indicated by ending each
// non-final line with a SEMICOLON (0x3b) word immediately before the
// NEWLINE; that trailing ';' word marks the line as continued.
23// Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
24// Quoting with '...'
25// Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
// Inside quotes, anything < <space>, DEL, \' or \\ needs escaping.
27// Escapes are \\, \', \n, \t, \_, everything else as \<hex><hex>?
28// Spaces separate words, UTF8 encoding for non-ascii chars
29
30namespace Cody {
31namespace Detail {
32
33static const char CONTINUE = S2C(u8";");
34
35void MessageBuffer::BeginLine ()
36{
37  if (!buffer.empty ())
38    {
39      // Terminate the previous line with a continuation
40      buffer.reserve (buffer.size () + 3);
41      buffer.push_back (S2C(u8" "));
42      buffer.push_back (CONTINUE);
43      buffer.push_back (S2C(u8"\n"));
44    }
45  lastBol = buffer.size ();
46}
47
48// QUOTE means 'maybe quote', we search it for quote-needing chars
49
50void MessageBuffer::Append (char const *str, bool quote, size_t len)
51{
52  if (len == ~size_t (0))
53    len = strlen (str);
54
55  if (!len && !quote)
56    return;
57
58  // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything
59  // that could remotely be shell-active.  UTF8 encoding for non-ascii.
60  if (quote && len)
61    {
62      quote = false;
63      // Scan looking for quote-needing characters.  We could just
64      // append until we find one, but that's probably confusing
65      for (size_t ix = len; ix--;)
66	{
67	  unsigned char c = (unsigned char)str[ix];
68	  if (!((c >= S2C(u8"a") && c <= S2C(u8"z"))
69		|| (c >= S2C(u8"A") && c <= S2C(u8"Z"))
70		|| (c >= S2C(u8"0") && c <= S2C(u8"9"))
71		|| c == S2C(u8"-") || c == S2C(u8"+") || c == S2C(u8"_")
72		|| c == S2C(u8"/") || c == S2C(u8"%") || c == S2C(u8".")))
73	    {
74	      quote = true;
75	      break;
76	    }
77	}
78    }
79
80  // Maximal length of appended string
81  buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);
82
83  if (quote)
84    buffer.push_back (S2C(u8"'"));
85
86  for (auto *end = str + len; str != end;)
87    {
88      auto *e = end;
89
90      if (quote)
91	// Look for next escape-needing char.  More relaxed than
92	// the earlier needs-quoting check.
93	for (e = str; e != end; ++e)
94	  {
95	    unsigned char c = (unsigned char)*e;
96	    if (c < S2C(u8" ") || c == 0x7f
97		|| c == S2C(u8"\\") || c == S2C(u8"'"))
98	      break;
99	  }
100      buffer.insert (buffer.end (), str, e);
101      str = e;
102
103      if (str == end)
104	break;
105
106      buffer.push_back (S2C(u8"\\"));
107      switch (unsigned char c = (unsigned char)*str++)
108	{
109	case S2C(u8"\t"):
110	  c = S2C(u8"t");
111	  goto append;
112
113	case S2C(u8"\n"):
114	  c = S2C(u8"n");
115	  goto append;
116
117	case S2C(u8"'"):
118	case S2C(u8"\\"):
119	append:
120	  buffer.push_back (c);
121	  break;
122
123	default:
124	  // Full-on escape.  Use 2 lower-case hex chars
125	  for (unsigned shift = 8; shift;)
126	    {
127	      shift -= 4;
128
129	      char nibble = (c >> shift) & 0xf;
130	      nibble += S2C(u8"0");
131	      if (nibble > S2C(u8"9"))
132		nibble += S2C(u8"a") - (S2C(u8"9") + 1);
133	      buffer.push_back (nibble);
134	    }
135	}
136    }
137
138  if (quote)
139    buffer.push_back (S2C(u8"'"));
140}
141
142void MessageBuffer::Append (char c)
143{
144  buffer.push_back (c);
145}
146
147void MessageBuffer::AppendInteger (unsigned u)
148{
149  // Sigh, even though std::to_string is C++11, we support building on
150  // gcc 4.8, which is a C++11 compiler lacking std::to_string.  so
151  // have something horrible.
152  std::string v (20, 0);
153  size_t len = snprintf (const_cast<char *> (v.data ()), v.size (), "%u", u);
154  v.erase (len);
155
156  AppendWord (v);
157}
158
159int MessageBuffer::Write (int fd) noexcept
160{
161  size_t limit = buffer.size () - lastBol;
162  ssize_t count = write (fd, &buffer.data ()[lastBol], limit);
163
164  int err = 0;
165  if (count < 0)
166    err = errno;
167  else
168    {
169      lastBol += count;
170      if (size_t (count) != limit)
171	err = EAGAIN;
172    }
173
174  if (err != EAGAIN && err != EINTR)
175    {
176      // Reset for next message
177      buffer.clear ();
178      lastBol = 0;
179    }
180
181  return err;
182}
183
184int MessageBuffer::Read (int fd) noexcept
185{
186  constexpr size_t blockSize = 200;
187
188  size_t lwm = buffer.size ();
189  size_t hwm = buffer.capacity ();
190  if (hwm - lwm < blockSize / 2)
191    hwm += blockSize;
192  buffer.resize (hwm);
193
194  auto iter = buffer.begin () + lwm;
195  ssize_t count = read (fd, &*iter, hwm - lwm);
196  buffer.resize (lwm + (count >= 0 ? count : 0));
197
198  if (count < 0)
199    return errno;
200
201  if (!count)
202    // End of file
203    return -1;
204
205  bool more = true;
206  for (;;)
207    {
208      auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));
209      if (newline == buffer.end ())
210	break;
211      more = newline != buffer.begin () && newline[-1] == CONTINUE;
212      iter = newline + 1;
213
214      if (iter == buffer.end ())
215	break;
216
217      if (!more)
218	{
219	  // There is no continuation, but there are chars after the
220	  // newline.  Truncate the buffer and return an error
221	  buffer.resize (iter - buffer.begin ());
222	  return EINVAL;
223	}
224    }
225
226  return more ? EAGAIN : 0;
227}
228
229int MessageBuffer::Lex (std::vector<std::string> &result)
230{
231  result.clear ();
232
233  if (IsAtEnd ())
234    return ENOENT;
235
236  Assert (buffer.back () == S2C(u8"\n"));
237
238  auto iter = buffer.begin () + lastBol;
239
240  for (std::string *word = nullptr;;)
241    {
242      char c = *iter;
243
244      ++iter;
245      if (c == S2C(u8" ") || c == S2C(u8"\t"))
246	{
247	  word = nullptr;
248	  continue;
249	}
250
251      if (c == S2C(u8"\n"))
252	break;
253
254      if (c == CONTINUE)
255	{
256	  // Line continuation
257	  if (word || *iter != S2C(u8"\n"))
258	    goto malformed;
259	  ++iter;
260	  break;
261	}
262
263      if (c <= S2C(u8" ") || c >= 0x7f)
264	goto malformed;
265
266      if (!word)
267	{
268	  result.emplace_back ();
269	  word = &result.back ();
270	}
271
272      if (c == S2C(u8"'"))
273	{
274	  // Quoted word
275	  for (;;)
276	    {
277	      c = *iter;
278
279	      if (c == S2C(u8"\n"))
280		{
281		malformed:;
282		  result.clear ();
283		  iter = std::find (iter, buffer.end (), S2C(u8"\n"));
284		  auto back = iter;
285		  if (back[-1] == CONTINUE  && back[-2] == S2C(u8" "))
286		    // Smells like a line continuation
287		    back -= 2;
288		  result.emplace_back (&buffer[lastBol],
289				       back - buffer.begin () - lastBol);
290		  ++iter;
291		  lastBol = iter - buffer.begin ();
292		  return EINVAL;
293		}
294
295	      if (c < S2C(u8" ") || c >= 0x7f)
296		goto malformed;
297
298	      ++iter;
299	      if (c == S2C(u8"'"))
300		break;
301
302	      if (c == S2C(u8"\\"))
303		// escape
304		switch (c = *iter)
305		  {
306		    case S2C(u8"\\"):
307		    case S2C(u8"'"):
308		      ++iter;
309		      break;
310
311		    case S2C(u8"n"):
312		      c = S2C(u8"\n");
313		      ++iter;
314		      break;
315
316		    case S2C(u8"_"):
317		      // We used to escape SPACE as \_, so accept that
318		      c = S2C(u8" ");
319		      ++iter;
320		      break;
321
322		    case S2C(u8"t"):
323		      c = S2C(u8"\t");
324		      ++iter;
325		      break;
326
327		    default:
328		      {
329			unsigned v = 0;
330			for (unsigned nibble = 0; nibble != 2; nibble++)
331			  {
332			    c = *iter;
333			    if (c < S2C(u8"0"))
334			      {
335				if (!nibble)
336				  goto malformed;
337				break;
338			      }
339			    else if (c <= S2C(u8"9"))
340			      c -= S2C(u8"0");
341			    else if (c < S2C(u8"a"))
342			      {
343				if (!nibble)
344				  goto malformed;
345				break;
346			      }
347			    else if (c <= S2C(u8"f"))
348			      c -= S2C(u8"a") - 10;
349			    else
350			      {
351				if (!nibble)
352				  goto malformed;
353				break;
354			      }
355			    ++iter;
356			    v = (v << 4) | c;
357			  }
358			c = v;
359		      }
360		  }
361	      word->push_back (c);
362	    }
363	}
364      else
365	// Unquoted character
366	word->push_back (c);
367    }
368  lastBol = iter - buffer.begin ();
369  if (result.empty ())
370    return ENOENT;
371
372  return 0;
373}
374
375void MessageBuffer::LexedLine (std::string &str)
376{
377  if (lastBol)
378    {
379      size_t pos = lastBol - 1;
380      for (; pos; pos--)
381	if (buffer[pos-1] == S2C(u8"\n"))
382	  break;
383
384      size_t end = lastBol - 1;
385      if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" "))
386	// Strip line continuation
387	end -= 2;
388      str.append (&buffer[pos], end - pos);
389    }
390}
391} // Detail
392} // Cody
393