buffer.cc 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. // CODYlib -*- mode:c++ -*-
  2. // Copyright (C) 2020 Nathan Sidwell, nathan@acm.org
  3. // License: Apache v2.0
  4. // Cody
  5. #include "internal.hh"
  6. // C++
  7. #include <algorithm>
  8. // C
  9. #include <cstring>
  10. // OS
  11. #include <unistd.h>
  12. #include <cerrno>
  13. // MessageBuffer code
  14. // Lines consist of words and end with a NEWLINE (0xa) char
  15. // Whitespace characters are TAB (0x9) and SPACE (0x20)
  16. // Words consist of non-whitespace chars separated by whitespace.
  // Multiple lines in one transaction are indicated by ending each
  // non-final line with a SEMICOLON (0x3b) word immediately before the
  // NEWLINE; that trailing ';' word marks the line as continued.
  20. // Words matching regexp [-+_/%.a-zA-Z0-9]+ need no quoting.
  21. // Quoting with '...'
  22. // Anything outside of [-+_/%.a-zA-Z0-9] needs quoting
  // Inside quotes, control characters (anything below SPACE), DEL, ' and \
  // need escaping.
  // Escapes are \\, \', \n, \t, and (accepted on input only) \_ for SPACE;
  // everything else is written as \<hex><hex>? using lower-case hex digits.
  25. // Spaces separate words, UTF8 encoding for non-ascii chars
  26. namespace Cody {
  27. namespace Detail {
  28. static const char CONTINUE = S2C(u8";");
  29. void MessageBuffer::BeginLine ()
  30. {
  31. if (!buffer.empty ())
  32. {
  33. // Terminate the previous line with a continuation
  34. buffer.reserve (buffer.size () + 3);
  35. buffer.push_back (S2C(u8" "));
  36. buffer.push_back (CONTINUE);
  37. buffer.push_back (S2C(u8"\n"));
  38. }
  39. lastBol = buffer.size ();
  40. }
  41. // QUOTE means 'maybe quote', we search it for quote-needing chars
  42. void MessageBuffer::Append (char const *str, bool quote, size_t len)
  43. {
  44. if (len == ~size_t (0))
  45. len = strlen (str);
  46. if (!len && !quote)
  47. return;
  48. // We want to quote characters outside of [-+_A-Za-z0-9/%.], anything
  49. // that could remotely be shell-active. UTF8 encoding for non-ascii.
  50. if (quote && len)
  51. {
  52. quote = false;
  53. // Scan looking for quote-needing characters. We could just
  54. // append until we find one, but that's probably confusing
  55. for (size_t ix = len; ix--;)
  56. {
  57. unsigned char c = (unsigned char)str[ix];
  58. if (!((c >= S2C(u8"a") && c <= S2C(u8"z"))
  59. || (c >= S2C(u8"A") && c <= S2C(u8"Z"))
  60. || (c >= S2C(u8"0") && c <= S2C(u8"9"))
  61. || c == S2C(u8"-") || c == S2C(u8"+") || c == S2C(u8"_")
  62. || c == S2C(u8"/") || c == S2C(u8"%") || c == S2C(u8".")))
  63. {
  64. quote = true;
  65. break;
  66. }
  67. }
  68. }
  69. // Maximal length of appended string
  70. buffer.reserve (buffer.size () + len * (quote ? 3 : 1) + 2);
  71. if (quote)
  72. buffer.push_back (S2C(u8"'"));
  73. for (auto *end = str + len; str != end;)
  74. {
  75. auto *e = end;
  76. if (quote)
  77. // Look for next escape-needing char. More relaxed than
  78. // the earlier needs-quoting check.
  79. for (e = str; e != end; ++e)
  80. {
  81. unsigned char c = (unsigned char)*e;
  82. if (c < S2C(u8" ") || c == 0x7f
  83. || c == S2C(u8"\\") || c == S2C(u8"'"))
  84. break;
  85. }
  86. buffer.insert (buffer.end (), str, e);
  87. str = e;
  88. if (str == end)
  89. break;
  90. buffer.push_back (S2C(u8"\\"));
  91. switch (unsigned char c = (unsigned char)*str++)
  92. {
  93. case S2C(u8"\t"):
  94. c = S2C(u8"t");
  95. goto append;
  96. case S2C(u8"\n"):
  97. c = S2C(u8"n");
  98. goto append;
  99. case S2C(u8"'"):
  100. case S2C(u8"\\"):
  101. append:
  102. buffer.push_back (c);
  103. break;
  104. default:
  105. // Full-on escape. Use 2 lower-case hex chars
  106. for (unsigned shift = 8; shift;)
  107. {
  108. shift -= 4;
  109. char nibble = (c >> shift) & 0xf;
  110. nibble += S2C(u8"0");
  111. if (nibble > S2C(u8"9"))
  112. nibble += S2C(u8"a") - (S2C(u8"9") + 1);
  113. buffer.push_back (nibble);
  114. }
  115. }
  116. }
  117. if (quote)
  118. buffer.push_back (S2C(u8"'"));
  119. }
  120. void MessageBuffer::Append (char c)
  121. {
  122. buffer.push_back (c);
  123. }
  124. void MessageBuffer::AppendInteger (unsigned u)
  125. {
  126. // Sigh, even though std::to_string is C++11, we support building on
  127. // gcc 4.8, which is a C++11 compiler lacking std::to_string. so
  128. // have something horrible.
  129. std::string v (20, 0);
  130. size_t len = snprintf (const_cast<char *> (v.data ()), v.size (), "%u", u);
  131. v.erase (len);
  132. AppendWord (v);
  133. }
  134. int MessageBuffer::Write (int fd) noexcept
  135. {
  136. size_t limit = buffer.size () - lastBol;
  137. ssize_t count = write (fd, &buffer.data ()[lastBol], limit);
  138. int err = 0;
  139. if (count < 0)
  140. err = errno;
  141. else
  142. {
  143. lastBol += count;
  144. if (size_t (count) != limit)
  145. err = EAGAIN;
  146. }
  147. if (err != EAGAIN && err != EINTR)
  148. {
  149. // Reset for next message
  150. buffer.clear ();
  151. lastBol = 0;
  152. }
  153. return err;
  154. }
  155. int MessageBuffer::Read (int fd) noexcept
  156. {
  157. constexpr size_t blockSize = 200;
  158. size_t lwm = buffer.size ();
  159. size_t hwm = buffer.capacity ();
  160. if (hwm - lwm < blockSize / 2)
  161. hwm += blockSize;
  162. buffer.resize (hwm);
  163. auto iter = buffer.begin () + lwm;
  164. ssize_t count = read (fd, &*iter, hwm - lwm);
  165. buffer.resize (lwm + (count >= 0 ? count : 0));
  166. if (count < 0)
  167. return errno;
  168. if (!count)
  169. // End of file
  170. return -1;
  171. bool more = true;
  172. for (;;)
  173. {
  174. auto newline = std::find (iter, buffer.end (), S2C(u8"\n"));
  175. if (newline == buffer.end ())
  176. break;
  177. more = newline != buffer.begin () && newline[-1] == CONTINUE;
  178. iter = newline + 1;
  179. if (iter == buffer.end ())
  180. break;
  181. if (!more)
  182. {
  183. // There is no continuation, but there are chars after the
  184. // newline. Truncate the buffer and return an error
  185. buffer.resize (iter - buffer.begin ());
  186. return EINVAL;
  187. }
  188. }
  189. return more ? EAGAIN : 0;
  190. }
  191. int MessageBuffer::Lex (std::vector<std::string> &result)
  192. {
  193. result.clear ();
  194. if (IsAtEnd ())
  195. return ENOENT;
  196. Assert (buffer.back () == S2C(u8"\n"));
  197. auto iter = buffer.begin () + lastBol;
  198. for (std::string *word = nullptr;;)
  199. {
  200. char c = *iter;
  201. ++iter;
  202. if (c == S2C(u8" ") || c == S2C(u8"\t"))
  203. {
  204. word = nullptr;
  205. continue;
  206. }
  207. if (c == S2C(u8"\n"))
  208. break;
  209. if (c == CONTINUE)
  210. {
  211. // Line continuation
  212. if (word || *iter != S2C(u8"\n"))
  213. goto malformed;
  214. ++iter;
  215. break;
  216. }
  217. if (c <= S2C(u8" ") || c >= 0x7f)
  218. goto malformed;
  219. if (!word)
  220. {
  221. result.emplace_back ();
  222. word = &result.back ();
  223. }
  224. if (c == S2C(u8"'"))
  225. {
  226. // Quoted word
  227. for (;;)
  228. {
  229. c = *iter;
  230. if (c == S2C(u8"\n"))
  231. {
  232. malformed:;
  233. result.clear ();
  234. iter = std::find (iter, buffer.end (), S2C(u8"\n"));
  235. auto back = iter;
  236. if (back[-1] == CONTINUE && back[-2] == S2C(u8" "))
  237. // Smells like a line continuation
  238. back -= 2;
  239. result.emplace_back (&buffer[lastBol],
  240. back - buffer.begin () - lastBol);
  241. ++iter;
  242. lastBol = iter - buffer.begin ();
  243. return EINVAL;
  244. }
  245. if (c < S2C(u8" ") || c >= 0x7f)
  246. goto malformed;
  247. ++iter;
  248. if (c == S2C(u8"'"))
  249. break;
  250. if (c == S2C(u8"\\"))
  251. // escape
  252. switch (c = *iter)
  253. {
  254. case S2C(u8"\\"):
  255. case S2C(u8"'"):
  256. ++iter;
  257. break;
  258. case S2C(u8"n"):
  259. c = S2C(u8"\n");
  260. ++iter;
  261. break;
  262. case S2C(u8"_"):
  263. // We used to escape SPACE as \_, so accept that
  264. c = S2C(u8" ");
  265. ++iter;
  266. break;
  267. case S2C(u8"t"):
  268. c = S2C(u8"\t");
  269. ++iter;
  270. break;
  271. default:
  272. {
  273. unsigned v = 0;
  274. for (unsigned nibble = 0; nibble != 2; nibble++)
  275. {
  276. c = *iter;
  277. if (c < S2C(u8"0"))
  278. {
  279. if (!nibble)
  280. goto malformed;
  281. break;
  282. }
  283. else if (c <= S2C(u8"9"))
  284. c -= S2C(u8"0");
  285. else if (c < S2C(u8"a"))
  286. {
  287. if (!nibble)
  288. goto malformed;
  289. break;
  290. }
  291. else if (c <= S2C(u8"f"))
  292. c -= S2C(u8"a") - 10;
  293. else
  294. {
  295. if (!nibble)
  296. goto malformed;
  297. break;
  298. }
  299. ++iter;
  300. v = (v << 4) | c;
  301. }
  302. c = v;
  303. }
  304. }
  305. word->push_back (c);
  306. }
  307. }
  308. else
  309. // Unquoted character
  310. word->push_back (c);
  311. }
  312. lastBol = iter - buffer.begin ();
  313. if (result.empty ())
  314. return ENOENT;
  315. return 0;
  316. }
  317. void MessageBuffer::LexedLine (std::string &str)
  318. {
  319. if (lastBol)
  320. {
  321. size_t pos = lastBol - 1;
  322. for (; pos; pos--)
  323. if (buffer[pos-1] == S2C(u8"\n"))
  324. break;
  325. size_t end = lastBol - 1;
  326. if (buffer[end-1] == CONTINUE && buffer[end-2] == S2C(u8" "))
  327. // Strip line continuation
  328. end -= 2;
  329. str.append (&buffer[pos], end - pos);
  330. }
  331. }
  332. } // Detail
  333. } // Cody