doctools/micro_syntax.re2c.h

OILS / doctools / micro_syntax.re2c.h View on Github | oils.pub

907 lines, 708 significant

1	#ifndef MICRO_SYNTAX_H
2	#define MICRO_SYNTAX_H
3
4	#include <assert.h>
5	#include <string.h> // strlen()
6
7	#include <vector>
8
// Token kinds shared by all of the micro-syntax lexers in this file.  Each
// language-specific Matcher<>::Match() emits only a subset of them.
enum class Id {
  // Common to nearly all languages
  Comm,
  MaybeComment,  // for shell, resolved in a fix-up pass

  WS,

  Name,  // Keyword or Identifier
  Str,   // "" and Python r""
         // '' and Python r''
         // ''' """
         // body of here docs

  Other,  // any other text
  Unknown,

  // C++
  DelimStrBegin,  // for C++ R"zzz(hello)zzz"
  DelimStrEnd,
  Re2c,  // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // Shell
  HereBegin,
  HereEnd,

  // Html
  TagNameLeft,   // start <a> or <br id=foo />
  SelfClose,     // />
  TagNameRight,  // >
  EndTag,        // </a>
  CharEscape,    // named, decimal or hex escape: &amp; &#38; &#x26;
  AttrName,      // foo=
  BadAmpersand,
  BadLessThan,
  BadGreaterThan,
  // Reused: Str Other

  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  // StartLine,

  // These are special zero-width tokens for Python
  // Indent,
  // Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
65
66	struct Token {
67	Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
68	}
69	Token(Id id, int end_col)
70	: id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
71	}
72
73	Id id;
74	int end_col; // offset from char* line
75	int submatch_start; // ditto
76	int submatch_end; // ditto
77	};
78
79	// Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
80
// Per-language lexer state: the line being scanned, the scan position within
// it, and the current mode.  T is a mode enum that must have an `Outer`
// enumerator, which is the initial mode.
//
// The lexer never writes through the line pointer, so both entry points
// accept `const char*` (a `char*` argument still converts implicitly).
template <typename T>
class Lexer {
 public:
  // Start scanning at the beginning of `line`, in mode T::Outer.
  Lexer(const char* line) : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Switch to a new line and rewind the scan position to its start.
  // line_mode is not reset, so a mode entered on one line is still in effect
  // on the next one.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
96
97	template <typename T>
98	class Matcher {
99	public:
100	// Returns whether EOL was hit. Mutates lexer state, and fills in tok out
101	// param.
102	bool Match(Lexer<T>* lexer, Token* tok);
103	};
104
105	// Macros for semantic actions
106
// TOK(k): store token kind `k` in the `tok` out param, then `break` out of
// the enclosing `while (true)` scan loop.  Expands to two statements and
// relies on a variable named `tok` being in scope.
#define TOK(k) \
  tok->id = k; \
  break;

// TOK_MODE(k, m): like TOK(k), but first switches `lexer->line_mode` to `m`.
// Relies on variables named `tok` and `lexer` being in scope.
#define TOK_MODE(k, m)  \
  tok->id = k;          \
  lexer->line_mode = m; \
  break;

// SUBMATCH(s, e): record the pointers `s` and `e` as offsets from the start
// of the line in tok->submatch_start / tok->submatch_end.  The arguments are
// parenthesized and the pointer difference is narrowed to int explicitly.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                        \
  tok->submatch_start = static_cast<int>((s) - lexer->line_); \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
120
121	// Regex definitions shared between languages
122
/*!re2c
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank
  whitespace = [ \t\r\n]*;
  space_required = [ \t\r\n]+;

  identifier = [_a-zA-Z][_a-zA-Z0-9]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // Body of a quoted string: any char except NUL, the quote and backslash,
  // or a backslash followed by any non-NUL char
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
154
// Lexer modes for plain text: there is only one.
enum class text_mode_e {
  Outer,  // default
};
158
159	// Returns whether EOL was hit
160	template <>
161	bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
162	const char* p = lexer->p_current; // mutated by re2c
163
164	while (true) {
165	/*!re2c
166	nul { return true; }
167
168	// whitespace at start of line
169	whitespace { TOK(Id::WS); }
170
171	// This rule consumes trailing whitespace, but
172	// it's OK. We're counting significant lines, not
173	// highlighting.
174	[^\x00]+ { TOK(Id::Other); }
175
176	* { TOK(Id::Other); }
177
178	*/
179	}
180
181	tok->end_col = p - lexer->line_;
182	lexer->p_current = p;
183	return false;
184	}
185
// Lexer modes for ASDL: there is only one.
enum class asdl_mode_e {
  Outer,
};
189
190	// Returns whether EOL was hit
191	template <>
192	bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
193	const char* p = lexer->p_current; // mutated by re2c
194
195	switch (lexer->line_mode) {
196	case asdl_mode_e::Outer:
197	while (true) {
198	/*!re2c
199	nul { return true; }
200
201	whitespace { TOK(Id::WS); }
202
203	identifier { TOK(Id::Name); }
204
205	pound_comment { TOK(Id::Comm); }
206
207	// Not the start of a comment, identifier
208	[^\x00#_a-zA-Z]+ { TOK(Id::Other); }
209
210	// e.g. unclosed quote like "foo
211	* { TOK(Id::Unknown); }
212
213	*/
214	}
215	break;
216	}
217
218	tok->end_col = p - lexer->line_;
219	lexer->p_current = p;
220	return false;
221	}
222
// Lexer modes for Python.  The Multi* modes persist across lines until the
// closing triple quote is matched.
enum class py_mode_e {
  Outer,    // default
  MultiSQ,  // inside '''
  MultiDQ,  // inside """
};
228
229	// Returns whether EOL was hit
230	template <>
231	bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
232	const char* p = lexer->p_current; // mutated by re2c
233	const char* YYMARKER = p;
234
235	switch (lexer->line_mode) {
236	case py_mode_e::Outer:
237	while (true) {
238	/*!re2c
239	nul { return true; }
240
241	whitespace { TOK(Id::WS); }
242
243	identifier { TOK(Id::Name); }
244
245	[r]? sq_string { TOK(Id::Str); }
246	[r]? dq_string { TOK(Id::Str); }
247
248	// optional raw prefix
249	[r]? triple_sq { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
250	[r]? triple_dq { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }
251
252	pound_comment { TOK(Id::Comm); }
253
254	// Not the start of a string, comment, identifier
255	[^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
256
257	// e.g. unclosed quote like "foo
258	* { TOK(Id::Unknown); }
259
260	*/
261	}
262	break;
263
264	case py_mode_e::MultiSQ:
265	while (true) {
266	/*!re2c
267	nul { return true; }
268
269	triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }
270
271	[^\x00']* { TOK(Id::Str); }
272
273	* { TOK(Id::Str); }
274
275	*/
276	}
277	break;
278
279	case py_mode_e::MultiDQ:
280	while (true) {
281	/*!re2c
282	nul { return true; }
283
284	triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }
285
286	[^\x00"]* { TOK(Id::Str); }
287
288	* { TOK(Id::Str); }
289
290	*/
291	}
292	break;
293	}
294
295	tok->end_col = p - lexer->line_;
296	lexer->p_current = p;
297	return false;
298	}
299
// Lexer modes for C++.  The non-Outer modes persist across lines until
// their terminator is matched.
enum class cpp_mode_e {
  Outer,     // default
  Comm,      // inside /* */ comment
  DelimStr,  // R"zz(string literal)zz"
  Re2c,      // /* !re2c
};
306
307	// Returns whether EOL was hit
308	template <>
309	bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
310	const char* p = lexer->p_current; // mutated by re2c
311	const char* YYMARKER = p;
312	const char s, e; // submatch extraction
313
314	// Autogenerated tag variables used by the lexer to track tag values.
315	/!stags:re2c format = 'const char @@;\n'; */
316
317	switch (lexer->line_mode) {
318	case cpp_mode_e::Outer:
319
320	while (true) {
321	/*!re2c
322	nul { return true; }
323
324	whitespace { TOK(Id::WS); }
325
326	"{" { TOK(Id::LBrace); }
327	"}" { TOK(Id::RBrace); }
328
329	identifier { TOK(Id::Name); }
330
331	// approximation for C++ char literals
332	sq_string { TOK(Id::Str); }
333	dq_string { TOK(Id::Str); }
334
335	// Not the start of a string, comment, identifier
336	[^\x00"'/_a-zA-Z{}]+ { TOK(Id::Other); }
337
338	"//" not_nul* { TOK(Id::Comm); }
339
340	// Treat re2c as preprocessor block
341	"/" "*!re2c" { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }
342
343	"/" "*" { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }
344
345	// Not sure what the rules are for R"zz(hello)zz". Make it similar to
346	// here docs.
347	cpp_delim_str = [_a-zA-Z]*;
348
349	"R" ["] @s cpp_delim_str @e "(" {
350	SUBMATCH(s, e);
351	TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
352	}
353
354	// e.g. unclosed quote like "foo
355	* { TOK(Id::Unknown); }
356
357	*/
358	}
359	break;
360
361	case cpp_mode_e::Comm:
362	// Search until next */
363	while (true) {
364	/*!re2c
365	nul { return true; }
366
367	"*" "/" { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }
368
369	[^\x00] { TOK(Id::Comm); }
370
371	* { TOK(Id::Comm); }
372
373	*/
374	}
375	break;
376
377	case cpp_mode_e::Re2c:
378	// Search until next */
379	while (true) {
380	/*!re2c
381	nul { return true; }
382
383	"*" "/" { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }
384
385	[^\x00] { TOK(Id::Re2c); }
386
387	* { TOK(Id::Re2c); }
388
389	*/
390	}
391	break;
392
393	case cpp_mode_e::DelimStr:
394	// Search until next */
395	while (true) {
396	/*!re2c
397	nul { return true; }
398
399	")" @s cpp_delim_str @e ["] {
400	SUBMATCH(s, e);
401	TOK(Id::DelimStrEnd);
402
403	// Caller is responsible for checking the extracted delimiter, and
404	// setting mode back to Cpp::Outer!
405	}
406
407	[^\x00)]* { TOK(Id::Str); }
408
409	* { TOK(Id::Str); }
410
411	*/
412	}
413	break;
414	}
415
416	tok->end_col = p - lexer->line_;
417	lexer->p_current = p;
418	return false;
419	}
420
421	class Hook {
422	public:
423	// Return true if this is a preprocessor line, and fill in tokens
424	// Caller should check last token for whether there is a continuation line.
425	virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
426	;
427	}
428	virtual ~Hook() {
429	}
430	};
431
// Lexer modes for C preprocessor lines: there is only one.
enum class pp_mode_e {
  Outer,
};
435
436	// Returns whether EOL was hit
437	template <>
438	bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
439	const char* p = lexer->p_current; // mutated by re2c
440	const char* YYMARKER = p;
441
442	switch (lexer->line_mode) {
443	case pp_mode_e::Outer:
444	while (true) {
445	/*!re2c
446	nul { return true; }
447
448	// Resolved in fix-up pass
449	// #include #define etc. only valid at the
450	// beginning
451	[ \t]* "#" [a-z]+ { TOK(Id::MaybePreproc); }
452
453	// C-style comments can end these lines
454	"//" not_nul* { TOK(Id::Comm); }
455
456	[\\] [\n] { TOK(Id::LineCont); }
457
458	// A line could be all whitespace, then \ at the
459	// end. And it's not significant
460	whitespace { TOK(Id::WS); }
461
462	// Not the start of a command, comment, or line
463	// continuation
464	[^\x00#/\\]+ { TOK(Id::PreprocOther); }
465
466	* { TOK(Id::PreprocOther); }
467
468	*/
469	}
470	break;
471	}
472
473	tok->end_col = p - lexer->line_;
474	lexer->p_current = p;
475	return false;
476	}
477
478	class CppHook : public Hook {
479	public:
480	virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
481	};
482
// Lexer modes for R.  Quoted strings may span lines, so SQ / DQ persist
// until the closing quote is matched.
enum class R_mode_e {
  Outer,  // default

  SQ,  // inside multi-line ''
  DQ,  // inside multi-line ""
};
489
490	// Returns whether EOL was hit
491	template <>
492	bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
493	const char* p = lexer->p_current; // mutated by re2c
494	const char* YYMARKER = p;
495
496	switch (lexer->line_mode) {
497	case R_mode_e::Outer:
498	while (true) {
499	/*!re2c
500	nul { return true; }
501
502	whitespace { TOK(Id::WS); }
503
504	pound_comment { TOK(Id::Comm); }
505
506	identifier { TOK(Id::Name); }
507
508	// Not the start of a string, escaped, comment, identifier
509	[^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
510
511	['] { TOK_MODE(Id::Str, R_mode_e::SQ); }
512	["] { TOK_MODE(Id::Str, R_mode_e::DQ); }
513
514	* { TOK(Id::Unknown); }
515
516	*/
517	}
518	break;
519
520	case R_mode_e::SQ:
521	while (true) {
522	/*!re2c
523	nul { return true; }
524
525	['] { TOK_MODE(Id::Str, R_mode_e::Outer); }
526
527	sq_middle { TOK(Id::Str); }
528
529	* { TOK(Id::Str); }
530
531	*/
532	}
533	break;
534
535	case R_mode_e::DQ:
536	while (true) {
537	/*!re2c
538	nul { return true; }
539
540	["] { TOK_MODE(Id::Str, R_mode_e::Outer); }
541
542	dq_middle { TOK(Id::Str); }
543
544	* { TOK(Id::Str); }
545
546	*/
547	}
548	break;
549	}
550
551	tok->end_col = p - lexer->line_;
552	lexer->p_current = p;
553	return false;
554	}
555
556	// Problem with shell: nested double quotes!!!
557	// We probably discourage this in YSH
558
// Lexer modes for shell.  The Ysh* modes are declared but not implemented:
// Matcher<sh_mode_e>::Match() asserts if it is called in one of them.
enum class sh_mode_e {
  Outer,  // default

  SQ,        // inside multi-line ''
  DollarSQ,  // inside multi-line $''
  DQ,        // inside multi-line ""

  // We could have a separate thing for this
  YshSQ,  // inside '''
  YshDQ,  // inside """
  YshJ,   // inside j"""
};
571
572	// Returns whether EOL was hit
573
574	// Submatch docs:
575	// https://re2c.org/manual/manual_c.html#submatch-extraction
576
577	template <>
578	bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
579	const char* p = lexer->p_current; // mutated by re2c
580	const char* YYMARKER = p;
581	const char s, e; // submatch extraction
582
583	// Autogenerated tag variables used by the lexer to track tag values.
584	/!stags:re2c format = 'const char @@;\n'; */
585
586	switch (lexer->line_mode) {
587	case sh_mode_e::Outer:
588	while (true) {
589	/*!re2c
590	nul { return true; }
591
592	whitespace { TOK(Id::WS); }
593
594	// Resolved in fix-up pass
595	pound_comment { TOK(Id::MaybeComment); }
596
597	// not that relevant for shell
598	identifier { TOK(Id::Name); }
599
600	// Not the start of a string, escaped, comment, identifier, here doc
601	[^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }
602
603	// echo is like a string
604	"\\" . { TOK(Id::Str); }
605
606	['] { TOK_MODE(Id::Str, sh_mode_e::SQ); }
607	["] { TOK_MODE(Id::Str, sh_mode_e::DQ); }
608	"$'" { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
609
610	// <<- is another syntax
611	here_op = "<<" [-]? [ \t]*;
612	h_delim = [_a-zA-Z][_a-zA-Z0-9]*;
613
614	// unquoted or quoted
615	here_op @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
616	here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
617	here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
618	here_op "\\" @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
619
620	// NOT Unknown, as in Python
621	* { TOK(Id::Other); }
622
623	*/
624	}
625	break;
626
627	case sh_mode_e::SQ:
628	// Search until next ' unconditionally
629	while (true) {
630	/*!re2c
631	nul { return true; }
632
633	['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
634
635	[^\x00']* { TOK(Id::Str); }
636
637	* { TOK(Id::Str); }
638
639	*/
640	}
641	break;
642
643	case sh_mode_e::DQ:
644	// Search until next " that's not preceded by "
645	while (true) {
646	/*!re2c
647	nul { return true; }
648
649	["] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
650
651	dq_middle { TOK(Id::Str); }
652
653	* { TOK(Id::Str); }
654
655	*/
656	}
657	break;
658
659	case sh_mode_e::DollarSQ:
660	// Search until next ' that's not preceded by "
661	while (true) {
662	/*!re2c
663	nul { return true; }
664
665	['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
666
667	sq_middle { TOK(Id::Str); }
668
669	* { TOK(Id::Str); }
670
671	*/
672	}
673	break;
674	case sh_mode_e::YshSQ:
675	case sh_mode_e::YshDQ:
676	case sh_mode_e::YshJ:
677	assert(0);
678	}
679
680	tok->end_col = p - lexer->line_;
681	lexer->p_current = p;
682	return false;
683	}
684
// Lexer modes for HTML.  HtmlCData is declared but has no case in
// Matcher<html_mode_e>::Match(), which asserts if called in that mode.
enum class html_mode_e {
  Outer,          // <NAME enters the TAG state
  AttrName,       // NAME=" NAME=' NAME= NAME
  AttrValue,      // NAME=" NAME=' NAME=
  SQ,             // respects Chars, can contain "
  DQ,             // respects Chars, can contain '
  Comm,           // <!-- -->
  Preprocessing,  // <? ?>
  CData,          // <![CDATA[ x ]]>
  HtmlCData,      // <script> <style>
};
696
// Token sequences for a start tag:
//   TagNameLeft ... TagNameRight    <a href=/ >
//   TagNameLeft ... SelfClose       <br id=foo />
699
700	// Returns whether EOL was hit
701	template <>
702	bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
703	const char* p = lexer->p_current; // mutated by re2c
704	const char* YYMARKER = p;
705
706	/*!re2c
707	// Common definitions
708
709	// Like _NAME_RE in HTM8
710	name = [a-zA-Z][a-zA-Z0-9:_-]* ;
711
712	// TODO: check this pattern
713	char_name = "&" [a-zA-Z][a-zA-Z0-9]* ";" ;
714	char_dec = "&#" [0-9]+ ";" ;
715	char_hex = "&#x" [0-9a-fA-F]+ ";" ;
716	*/
717
718	switch (lexer->line_mode) {
719	case html_mode_e::Outer:
720	while (true) {
721	/*!re2c
722	// accepted EOF
723	nul { return true; }
724
725	char_name { TOK(Id::CharEscape); }
726	char_dec { TOK(Id::CharEscape); }
727	char_hex { TOK(Id::CharEscape); }
728
729	"&" { TOK(Id::BadAmpersand); }
730	">" { TOK(Id::BadGreaterThan); }
731	"<" { TOK(Id::BadLessThan); }
732
733	"</" name ">" { TOK(Id::EndTag); }
734
735	"<" name {
736	TOK_MODE(Id::TagNameLeft, html_mode_e::AttrName);
737	// TODO: <script> <style> - special logic for strstr()
738	}
739
740	// Problem: these can span more than one linee ... it needs to be
741	// another mode? The end tag might be technically the same.
742	"<!" [^\x00>]* ">" { TOK(Id::Comm); }
743
744	"<!--" { TOK_MODE(Id::Comm, html_mode_e::Comm); }
745	"<?" { TOK_MODE(Id::Comm, html_mode_e::Preprocessing); }
746	"<![CDATA[" { TOK_MODE(Id::Str, html_mode_e::CData); }
747
748
749	// Like RawData
750	* { TOK(Id::Other); }
751
752	*/
753	}
754	break;
755	case html_mode_e::AttrName:
756	while (true) {
757	/*!re2c
758	nul { return true; } // TODO: error
759
760	// TODO: If the tag was <script> or <STYLE>, then we want to enter
761	// HtmlCData mode, until we hit </script> or </STYLE>.
762	// This is live throughout AttrName, AttrValue, SQ, DQ states?
763	">" { TOK_MODE(Id::TagNameRight, html_mode_e::Outer); }
764	"/>" { TOK_MODE(Id::SelfClose, html_mode_e::Outer); }
765
766	space_required name {
767	// <a missing> - stay in the AttrName mode
768	TOK(Id::AttrName);
769	}
770
771	space_required name whitespace '=' whitespace {
772	// NAME= NAME=' NAME=" - expecting a value
773	TOK_MODE(Id::AttrName, html_mode_e::AttrValue);
774	}
775
776	* { TOK(Id::Unknown); }
777	*/
778	}
779	break;
780	case html_mode_e::AttrValue:
781	while (true) {
782	/*!re2c
783	nul { return true; } // TODO: error
784
785	["] { TOK_MODE(Id::Str, html_mode_e::DQ); }
786	['] { TOK_MODE(Id::Str, html_mode_e::SQ); }
787
788	// Unquoted value - a single token
789	unquoted_value = [^\x00 \r\n\t<>&"']+ ;
790
791	unquoted_value { TOK_MODE(Id::Str, html_mode_e::AttrName); }
792
793	* { TOK(Id::Unknown); }
794	*/
795	}
796	break;
797
798	case html_mode_e::DQ:
799	while (true) {
800	/*!re2c
801	nul { return true; } // TODO: error
802	char_name { TOK(Id::CharEscape); }
803	char_dec { TOK(Id::CharEscape); }
804	char_hex { TOK(Id::CharEscape); }
805
806	// we would only need these for translation to XML, not
807	// highlighting?
808	"&" { TOK(Id::BadAmpersand); }
809	">" { TOK(Id::BadGreaterThan); }
810	"<" { TOK(Id::BadLessThan); }
811
812	["] { TOK_MODE(Id::Str, html_mode_e::AttrName); }
813	* { TOK(Id::Str); }
814	*/
815	}
816	break;
817	case html_mode_e::SQ:
818	while (true) {
819	/*!re2c
820	nul { return true; } // TODO: error
821	char_name { TOK(Id::CharEscape); }
822	char_dec { TOK(Id::CharEscape); }
823	char_hex { TOK(Id::CharEscape); }
824
825	// we would only need these for translation to XML, not
826	// highlighting?
827	"&" { TOK(Id::BadAmpersand); }
828	">" { TOK(Id::BadGreaterThan); }
829	"<" { TOK(Id::BadLessThan); }
830	['] { TOK_MODE(Id::Str, html_mode_e::AttrName); }
831
832	* { TOK(Id::Str); }
833	*/
834	}
835	break;
836	case html_mode_e::Comm:
837	// Search until next -->
838	while (true) {
839	/*!re2c
840	nul { return true; }
841
842	"-->" { TOK_MODE(Id::Comm, html_mode_e::Outer); }
843
844	[^\x00-]* { TOK(Id::Comm); }
845
846	* { TOK(Id::Comm); }
847
848	*/
849	}
850	break;
851	case html_mode_e::Preprocessing:
852	// Search until next ?>
853	while (true) {
854	/*!re2c
855	nul { return true; }
856
857	"?>" { TOK_MODE(Id::Comm, html_mode_e::Outer); }
858
859	[^\x00?]* { TOK(Id::Comm); }
860
861	* { TOK(Id::Comm); }
862
863	*/
864	}
865	break;
866	case html_mode_e::CData:
867	// Search until next ]]>
868	while (true) {
869	/*!re2c
870	nul { return true; }
871
872	"]]>" { TOK_MODE(Id::Str, html_mode_e::Outer); }
873
874	[^\x00\]]* { TOK(Id::Str); }
875
876	* { TOK(Id::Str); }
877
878	*/
879	}
880	break;
881
882	default:
883	assert(0);
884	}
885
886	tok->end_col = p - lexer->line_;
887	lexer->p_current = p;
888	return false;
889	}
890
891	// TODO:
892	// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
893	// - same as C++ raw string, I think
894	// - similar to here docs, but less complex
895	//
896	// Inherent problems with "micro segmentation":
897	//
898	// - Nested double quotes in shell. echo "hi ${name:-"default"}"
899	// - This means that lexing is dependent on parsing: does the second
900	// double quote close the first one, or does it start a nested string?
901	// - lexing is non-recursive, parsing is recursive
902
// Shell comments depend on operator chars: a # only starts a comment in
// some contexts
//   echo one # comment
//   echo $(( 16#ff ))
906
907	#endif // MICRO_SYNTAX_H