| 1 | #ifndef MICRO_SYNTAX_H
|
| 2 | #define MICRO_SYNTAX_H
|
| 3 |
|
| 4 | #include <assert.h>
|
| 5 | #include <string.h> // strlen()
|
| 6 |
|
| 7 | #include <vector>
|
| 8 |
|
// Every kind of token that the Matcher<T>::Match() specializations can emit.
// Some kinds (Str, Other, Comm, ...) are reused by several lexers; the
// section headings below say which language introduced the others.
enum class Id {
  // ---- Common to nearly all languages ----
  Comm,          // comment text
  MaybeComment,  // shell only; resolved in a fix-up pass

  WS,  // whitespace

  Name,  // keyword or identifier
  Str,   // string text, which covers:
         //   ""  and Python r""
         //   ''  and Python r''
         //   ''' and """
         //   the body of here docs

  Other,    // any other text
  Unknown,  // emitted by the fallback rule of several lexers

  // ---- C++ ----
  DelimStrBegin,  // opening part of a raw string: R"zzz(hello)zzz"
  DelimStrEnd,    // closing part of a raw string
  Re2c,           // re2c code block

  MaybePreproc,    // resolved to PreprocCommand/PreprocOther in fix-up pass
  PreprocCommand,  // resolved #define
  PreprocOther,    // any other text
  LineCont,        // backslash at end of line, for #define continuation

  // Braces for C++ block structure.  Could be done in second pass after
  // removing comments/strings?
  LBrace,
  RBrace,

  // ---- Shell ----
  HereBegin,
  HereEnd,

  // ---- Html ----
  TagNameLeft,   // start <a>  or  <br id=foo />
  SelfClose,     // />
  TagNameRight,  // >
  EndTag,        // </a>
  CharEscape,    // named, decimal or hex character reference
  AttrName,      // foo=
  BadAmpersand,
  BadLessThan,
  BadGreaterThan,
  // Reused: Str Other

  // Zero-width token to detect #ifdef and Python INDENT/DEDENT
  // StartLine,

  // These are special zero-width tokens for Python
  // Indent,
  // Dedent,
  // Maintain our own stack!
  // https://stackoverflow.com/questions/40960123/how-exactly-a-dedent-token-is-generated-in-python
};
|
| 65 |
|
| 66 | struct Token {
|
| 67 | Token() : id(Id::Unknown), end_col(0), submatch_start(0), submatch_end(0) {
|
| 68 | }
|
| 69 | Token(Id id, int end_col)
|
| 70 | : id(id), end_col(end_col), submatch_start(0), submatch_end(0) {
|
| 71 | }
|
| 72 |
|
| 73 | Id id;
|
| 74 | int end_col; // offset from char* line
|
| 75 | int submatch_start; // ditto
|
| 76 | int submatch_end; // ditto
|
| 77 | };
|
| 78 |
|
| 79 | // Lexer and Matcher are specialized on py_mode_e, cpp_mode_e, ...
|
| 80 |
|
// Per-line lexer state shared with Matcher<T>::Match().
//
// T is a mode enum (py_mode_e, cpp_mode_e, ...) that must have an `Outer`
// enumerator, which is the initial mode.
template <typename T>
class Lexer {
 public:
  // Starts lexing at the beginning of `line` in mode T::Outer.  The buffer is
  // only read through const pointers, so a const line is accepted as well as a
  // mutable one.
  Lexer(const char* line)
      : line_(line), p_current(line), line_mode(T::Outer) {
  }

  // Points the lexer at a new line.  line_mode is deliberately left alone, so
  // a mode entered on one line is still in effect on the next.
  void SetLine(const char* line) {
    line_ = line;
    p_current = line;
  }

  const char* line_;      // start of the current line
  const char* p_current;  // points into line
  T line_mode;            // current mode, starts with Outer
};
|
| 96 |
|
| 97 | template <typename T>
|
| 98 | class Matcher {
|
| 99 | public:
|
| 100 | // Returns whether EOL was hit. Mutates lexer state, and fills in tok out
|
| 101 | // param.
|
| 102 | bool Match(Lexer<T>* lexer, Token* tok);
|
| 103 | };
|
| 104 |
|
// Macros for semantic actions
//
// They expand inside re2c rule actions and expect `tok` and `lexer` pointers
// to be in scope.  TOK and TOK_MODE end in `break`, which leaves the loop
// that surrounds the re2c block; that is why they cannot be wrapped in
// do { ... } while (0).  Arguments are parenthesized so that any expression
// can be passed safely.

// Sets the token kind and leaves the matching loop.
#define TOK(k)   \
  tok->id = (k); \
  break;

// Sets the token kind, switches the lexer mode, and leaves the matching loop.
#define TOK_MODE(k, m)    \
  tok->id = (k);          \
  lexer->line_mode = (m); \
  break;

// Records the submatch [s, e) as offsets from the start of the line.
// Must call TOK*() after this
#define SUBMATCH(s, e)                                        \
  tok->submatch_start = static_cast<int>((s) - lexer->line_); \
  tok->submatch_end = static_cast<int>((e) - lexer->line_);
|
| 120 |
|
// Regex definitions shared between languages
//
// The block below contains only re2c configuration and named patterns.  The
// per-language Match() functions refer to these names in their rules.

/*!re2c
  re2c:yyfill:enable = 0;
  re2c:define:YYCTYPE = char;
  re2c:define:YYCURSOR = p;

  // The NUL byte is what every lexer treats as end of line.
  nul = [\x00];
  not_nul = [^\x00];

  // Whitespace is needed for SLOC, to tell if a line is entirely blank.
  // Note that `whitespace` can match the empty string; `space_required`
  // cannot.
  whitespace = [ \t\n\r]*;
  space_required = [ \t\n\r]+;

  identifier = [a-zA-Z_][a-zA-Z0-9_]*;

  // Python and C++ have "" strings
  // C++ char literals are similar, e.g. '\''
  // We are not more precise

  // String bodies: any char except NUL, the quote and backslash, or a
  // backslash followed by any non-NUL char.
  sq_middle = ( [^\x00'\\] | "\\" not_nul )*;
  dq_middle = ( [^\x00"\\] | "\\" not_nul )*;

  sq_string = ['] sq_middle ['];
  dq_string = ["] dq_middle ["];

  // Shell and Python have # comments
  pound_comment = "#" not_nul*;

  // YSH and Python have ''' """
  triple_sq = "'''";
  triple_dq = ["]["]["];
*/
|
| 154 |
|
// Modes of the plain-text lexer; it has a single mode.
enum class text_mode_e {
  Outer,  // default
};
|
| 158 |
|
| 159 | // Returns whether EOL was hit
|
| 160 | template <>
|
| 161 | bool Matcher<text_mode_e>::Match(Lexer<text_mode_e>* lexer, Token* tok) {
|
| 162 | const char* p = lexer->p_current; // mutated by re2c
|
| 163 |
|
| 164 | while (true) {
|
| 165 | /*!re2c
|
| 166 | nul { return true; }
|
| 167 |
|
| 168 | // whitespace at start of line
|
| 169 | whitespace { TOK(Id::WS); }
|
| 170 |
|
| 171 | // This rule consumes trailing whitespace, but
|
| 172 | // it's OK. We're counting significant lines, not
|
| 173 | // highlighting.
|
| 174 | [^\x00]+ { TOK(Id::Other); }
|
| 175 |
|
| 176 | * { TOK(Id::Other); }
|
| 177 |
|
| 178 | */
|
| 179 | }
|
| 180 |
|
| 181 | tok->end_col = p - lexer->line_;
|
| 182 | lexer->p_current = p;
|
| 183 | return false;
|
| 184 | }
|
| 185 |
|
// Modes of the ASDL lexer; it has a single mode.
enum class asdl_mode_e {
  Outer,  // default
};
|
| 189 |
|
| 190 | // Returns whether EOL was hit
|
| 191 | template <>
|
| 192 | bool Matcher<asdl_mode_e>::Match(Lexer<asdl_mode_e>* lexer, Token* tok) {
|
| 193 | const char* p = lexer->p_current; // mutated by re2c
|
| 194 |
|
| 195 | switch (lexer->line_mode) {
|
| 196 | case asdl_mode_e::Outer:
|
| 197 | while (true) {
|
| 198 | /*!re2c
|
| 199 | nul { return true; }
|
| 200 |
|
| 201 | whitespace { TOK(Id::WS); }
|
| 202 |
|
| 203 | identifier { TOK(Id::Name); }
|
| 204 |
|
| 205 | pound_comment { TOK(Id::Comm); }
|
| 206 |
|
| 207 | // Not the start of a comment, identifier
|
| 208 | [^\x00#_a-zA-Z]+ { TOK(Id::Other); }
|
| 209 |
|
| 210 | // e.g. unclosed quote like "foo
|
| 211 | * { TOK(Id::Unknown); }
|
| 212 |
|
| 213 | */
|
| 214 | }
|
| 215 | break;
|
| 216 | }
|
| 217 |
|
| 218 | tok->end_col = p - lexer->line_;
|
| 219 | lexer->p_current = p;
|
| 220 | return false;
|
| 221 | }
|
| 222 |
|
// Modes of the Python lexer.  The Multi* modes persist across lines until
// the matching triple quote is seen.
enum class py_mode_e {
  Outer,    // default
  MultiSQ,  // inside '''
  MultiDQ,  // inside """
};
|
| 228 |
|
| 229 | // Returns whether EOL was hit
|
| 230 | template <>
|
| 231 | bool Matcher<py_mode_e>::Match(Lexer<py_mode_e>* lexer, Token* tok) {
|
| 232 | const char* p = lexer->p_current; // mutated by re2c
|
| 233 | const char* YYMARKER = p;
|
| 234 |
|
| 235 | switch (lexer->line_mode) {
|
| 236 | case py_mode_e::Outer:
|
| 237 | while (true) {
|
| 238 | /*!re2c
|
| 239 | nul { return true; }
|
| 240 |
|
| 241 | whitespace { TOK(Id::WS); }
|
| 242 |
|
| 243 | identifier { TOK(Id::Name); }
|
| 244 |
|
| 245 | [r]? sq_string { TOK(Id::Str); }
|
| 246 | [r]? dq_string { TOK(Id::Str); }
|
| 247 |
|
| 248 | // optional raw prefix
|
| 249 | [r]? triple_sq { TOK_MODE(Id::Str, py_mode_e::MultiSQ); }
|
| 250 | [r]? triple_dq { TOK_MODE(Id::Str, py_mode_e::MultiDQ); }
|
| 251 |
|
| 252 | pound_comment { TOK(Id::Comm); }
|
| 253 |
|
| 254 | // Not the start of a string, comment, identifier
|
| 255 | [^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
|
| 256 |
|
| 257 | // e.g. unclosed quote like "foo
|
| 258 | * { TOK(Id::Unknown); }
|
| 259 |
|
| 260 | */
|
| 261 | }
|
| 262 | break;
|
| 263 |
|
| 264 | case py_mode_e::MultiSQ:
|
| 265 | while (true) {
|
| 266 | /*!re2c
|
| 267 | nul { return true; }
|
| 268 |
|
| 269 | triple_sq { TOK_MODE(Id::Str, py_mode_e::Outer); }
|
| 270 |
|
| 271 | [^\x00']* { TOK(Id::Str); }
|
| 272 |
|
| 273 | * { TOK(Id::Str); }
|
| 274 |
|
| 275 | */
|
| 276 | }
|
| 277 | break;
|
| 278 |
|
| 279 | case py_mode_e::MultiDQ:
|
| 280 | while (true) {
|
| 281 | /*!re2c
|
| 282 | nul { return true; }
|
| 283 |
|
| 284 | triple_dq { TOK_MODE(Id::Str, py_mode_e::Outer); }
|
| 285 |
|
| 286 | [^\x00"]* { TOK(Id::Str); }
|
| 287 |
|
| 288 | * { TOK(Id::Str); }
|
| 289 |
|
| 290 | */
|
| 291 | }
|
| 292 | break;
|
| 293 | }
|
| 294 |
|
| 295 | tok->end_col = p - lexer->line_;
|
| 296 | lexer->p_current = p;
|
| 297 | return false;
|
| 298 | }
|
| 299 |
|
// Modes of the C++ lexer.  All non-Outer modes can persist across lines.
enum class cpp_mode_e {
  Outer,     // default
  Comm,      // inside /* */ comment
  DelimStr,  // R"zz(string literal)zz"
  Re2c,      // /* !re2c
};
|
| 306 |
|
| 307 | // Returns whether EOL was hit
|
| 308 | template <>
|
| 309 | bool Matcher<cpp_mode_e>::Match(Lexer<cpp_mode_e>* lexer, Token* tok) {
|
| 310 | const char* p = lexer->p_current; // mutated by re2c
|
| 311 | const char* YYMARKER = p;
|
| 312 | const char *s, *e; // submatch extraction
|
| 313 |
|
| 314 | // Autogenerated tag variables used by the lexer to track tag values.
|
| 315 | /*!stags:re2c format = 'const char *@@;\n'; */
|
| 316 |
|
| 317 | switch (lexer->line_mode) {
|
| 318 | case cpp_mode_e::Outer:
|
| 319 |
|
| 320 | while (true) {
|
| 321 | /*!re2c
|
| 322 | nul { return true; }
|
| 323 |
|
| 324 | whitespace { TOK(Id::WS); }
|
| 325 |
|
| 326 | "{" { TOK(Id::LBrace); }
|
| 327 | "}" { TOK(Id::RBrace); }
|
| 328 |
|
| 329 | identifier { TOK(Id::Name); }
|
| 330 |
|
| 331 | // approximation for C++ char literals
|
| 332 | sq_string { TOK(Id::Str); }
|
| 333 | dq_string { TOK(Id::Str); }
|
| 334 |
|
| 335 | // Not the start of a string, comment, identifier
|
| 336 | [^\x00"'/_a-zA-Z{}]+ { TOK(Id::Other); }
|
| 337 |
|
| 338 | "//" not_nul* { TOK(Id::Comm); }
|
| 339 |
|
| 340 | // Treat re2c as preprocessor block
|
| 341 | "/" "*!re2c" { TOK_MODE(Id::Re2c, cpp_mode_e::Re2c); }
|
| 342 |
|
| 343 | "/" "*" { TOK_MODE(Id::Comm, cpp_mode_e::Comm); }
|
| 344 |
|
| 345 | // Not sure what the rules are for R"zz(hello)zz". Make it similar to
|
| 346 | // here docs.
|
| 347 | cpp_delim_str = [_a-zA-Z]*;
|
| 348 |
|
| 349 | "R" ["] @s cpp_delim_str @e "(" {
|
| 350 | SUBMATCH(s, e);
|
| 351 | TOK_MODE(Id::DelimStrBegin, cpp_mode_e::DelimStr);
|
| 352 | }
|
| 353 |
|
| 354 | // e.g. unclosed quote like "foo
|
| 355 | * { TOK(Id::Unknown); }
|
| 356 |
|
| 357 | */
|
| 358 | }
|
| 359 | break;
|
| 360 |
|
| 361 | case cpp_mode_e::Comm:
|
| 362 | // Search until next */
|
| 363 | while (true) {
|
| 364 | /*!re2c
|
| 365 | nul { return true; }
|
| 366 |
|
| 367 | "*" "/" { TOK_MODE(Id::Comm, cpp_mode_e::Outer); }
|
| 368 |
|
| 369 | [^\x00*]* { TOK(Id::Comm); }
|
| 370 |
|
| 371 | * { TOK(Id::Comm); }
|
| 372 |
|
| 373 | */
|
| 374 | }
|
| 375 | break;
|
| 376 |
|
| 377 | case cpp_mode_e::Re2c:
|
| 378 | // Search until next */
|
| 379 | while (true) {
|
| 380 | /*!re2c
|
| 381 | nul { return true; }
|
| 382 |
|
| 383 | "*" "/" { TOK_MODE(Id::Re2c, cpp_mode_e::Outer); }
|
| 384 |
|
| 385 | [^\x00*]* { TOK(Id::Re2c); }
|
| 386 |
|
| 387 | * { TOK(Id::Re2c); }
|
| 388 |
|
| 389 | */
|
| 390 | }
|
| 391 | break;
|
| 392 |
|
| 393 | case cpp_mode_e::DelimStr:
|
| 394 | // Search until next */
|
| 395 | while (true) {
|
| 396 | /*!re2c
|
| 397 | nul { return true; }
|
| 398 |
|
| 399 | ")" @s cpp_delim_str @e ["] {
|
| 400 | SUBMATCH(s, e);
|
| 401 | TOK(Id::DelimStrEnd);
|
| 402 |
|
| 403 | // Caller is responsible for checking the extracted delimiter, and
|
| 404 | // setting mode back to Cpp::Outer!
|
| 405 | }
|
| 406 |
|
| 407 | [^\x00)]* { TOK(Id::Str); }
|
| 408 |
|
| 409 | * { TOK(Id::Str); }
|
| 410 |
|
| 411 | */
|
| 412 | }
|
| 413 | break;
|
| 414 | }
|
| 415 |
|
| 416 | tok->end_col = p - lexer->line_;
|
| 417 | lexer->p_current = p;
|
| 418 | return false;
|
| 419 | }
|
| 420 |
|
| 421 | class Hook {
|
| 422 | public:
|
| 423 | // Return true if this is a preprocessor line, and fill in tokens
|
| 424 | // Caller should check last token for whether there is a continuation line.
|
| 425 | virtual void TryPreprocess(char* line, std::vector<Token>* tokens) {
|
| 426 | ;
|
| 427 | }
|
| 428 | virtual ~Hook() {
|
| 429 | }
|
| 430 | };
|
| 431 |
|
// Modes of the preprocessor-line lexer; it has a single mode.
enum class pp_mode_e {
  Outer,  // default
};
|
| 435 |
|
| 436 | // Returns whether EOL was hit
|
| 437 | template <>
|
| 438 | bool Matcher<pp_mode_e>::Match(Lexer<pp_mode_e>* lexer, Token* tok) {
|
| 439 | const char* p = lexer->p_current; // mutated by re2c
|
| 440 | const char* YYMARKER = p;
|
| 441 |
|
| 442 | switch (lexer->line_mode) {
|
| 443 | case pp_mode_e::Outer:
|
| 444 | while (true) {
|
| 445 | /*!re2c
|
| 446 | nul { return true; }
|
| 447 |
|
| 448 | // Resolved in fix-up pass
|
| 449 | // #include #define etc. only valid at the
|
| 450 | // beginning
|
| 451 | [ \t]* "#" [a-z]+ { TOK(Id::MaybePreproc); }
|
| 452 |
|
| 453 | // C-style comments can end these lines
|
| 454 | "//" not_nul* { TOK(Id::Comm); }
|
| 455 |
|
| 456 | [\\] [\n] { TOK(Id::LineCont); }
|
| 457 |
|
| 458 | // A line could be all whitespace, then \ at the
|
| 459 | // end. And it's not significant
|
| 460 | whitespace { TOK(Id::WS); }
|
| 461 |
|
| 462 | // Not the start of a command, comment, or line
|
| 463 | // continuation
|
| 464 | [^\x00#/\\]+ { TOK(Id::PreprocOther); }
|
| 465 |
|
| 466 | * { TOK(Id::PreprocOther); }
|
| 467 |
|
| 468 | */
|
| 469 | }
|
| 470 | break;
|
| 471 | }
|
| 472 |
|
| 473 | tok->end_col = p - lexer->line_;
|
| 474 | lexer->p_current = p;
|
| 475 | return false;
|
| 476 | }
|
| 477 |
|
| 478 | class CppHook : public Hook {
|
| 479 | public:
|
| 480 | virtual void TryPreprocess(char* line, std::vector<Token>* tokens);
|
| 481 | };
|
| 482 |
|
// Modes of the R lexer.  Strings may span lines, so the quote modes persist
// until the closing quote is seen.
enum class R_mode_e {
  Outer,  // default

  SQ,  // inside multi-line ''
  DQ,  // inside multi-line ""
};
|
| 489 |
|
| 490 | // Returns whether EOL was hit
|
| 491 | template <>
|
| 492 | bool Matcher<R_mode_e>::Match(Lexer<R_mode_e>* lexer, Token* tok) {
|
| 493 | const char* p = lexer->p_current; // mutated by re2c
|
| 494 | const char* YYMARKER = p;
|
| 495 |
|
| 496 | switch (lexer->line_mode) {
|
| 497 | case R_mode_e::Outer:
|
| 498 | while (true) {
|
| 499 | /*!re2c
|
| 500 | nul { return true; }
|
| 501 |
|
| 502 | whitespace { TOK(Id::WS); }
|
| 503 |
|
| 504 | pound_comment { TOK(Id::Comm); }
|
| 505 |
|
| 506 | identifier { TOK(Id::Name); }
|
| 507 |
|
| 508 | // Not the start of a string, escaped, comment, identifier
|
| 509 | [^\x00"'#_a-zA-Z]+ { TOK(Id::Other); }
|
| 510 |
|
| 511 | ['] { TOK_MODE(Id::Str, R_mode_e::SQ); }
|
| 512 | ["] { TOK_MODE(Id::Str, R_mode_e::DQ); }
|
| 513 |
|
| 514 | * { TOK(Id::Unknown); }
|
| 515 |
|
| 516 | */
|
| 517 | }
|
| 518 | break;
|
| 519 |
|
| 520 | case R_mode_e::SQ:
|
| 521 | while (true) {
|
| 522 | /*!re2c
|
| 523 | nul { return true; }
|
| 524 |
|
| 525 | ['] { TOK_MODE(Id::Str, R_mode_e::Outer); }
|
| 526 |
|
| 527 | sq_middle { TOK(Id::Str); }
|
| 528 |
|
| 529 | * { TOK(Id::Str); }
|
| 530 |
|
| 531 | */
|
| 532 | }
|
| 533 | break;
|
| 534 |
|
| 535 | case R_mode_e::DQ:
|
| 536 | while (true) {
|
| 537 | /*!re2c
|
| 538 | nul { return true; }
|
| 539 |
|
| 540 | ["] { TOK_MODE(Id::Str, R_mode_e::Outer); }
|
| 541 |
|
| 542 | dq_middle { TOK(Id::Str); }
|
| 543 |
|
| 544 | * { TOK(Id::Str); }
|
| 545 |
|
| 546 | */
|
| 547 | }
|
| 548 | break;
|
| 549 | }
|
| 550 |
|
| 551 | tok->end_col = p - lexer->line_;
|
| 552 | lexer->p_current = p;
|
| 553 | return false;
|
| 554 | }
|
| 555 |
|
| 556 | // Problem with shell: nested double quotes!!!
|
| 557 | // We probably discourage this in YSH
|
| 558 |
|
// Modes of the shell lexer.  The quote modes persist across lines until the
// closing quote is seen.
enum class sh_mode_e {
  Outer,  // default

  SQ,        // inside multi-line ''
  DollarSQ,  // inside multi-line $''
  DQ,        // inside multi-line ""

  // We could have a separate thing for this.
  // Matcher<sh_mode_e>::Match() does not lex these modes; it asserts.
  YshSQ,  // inside '''
  YshDQ,  // inside """
  YshJ,   // inside j"""
};
|
| 571 |
|
| 572 | // Returns whether EOL was hit
|
| 573 |
|
| 574 | // Submatch docs:
|
| 575 | // https://re2c.org/manual/manual_c.html#submatch-extraction
|
| 576 |
|
| 577 | template <>
|
| 578 | bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
|
| 579 | const char* p = lexer->p_current; // mutated by re2c
|
| 580 | const char* YYMARKER = p;
|
| 581 | const char *s, *e; // submatch extraction
|
| 582 |
|
| 583 | // Autogenerated tag variables used by the lexer to track tag values.
|
| 584 | /*!stags:re2c format = 'const char *@@;\n'; */
|
| 585 |
|
| 586 | switch (lexer->line_mode) {
|
| 587 | case sh_mode_e::Outer:
|
| 588 | while (true) {
|
| 589 | /*!re2c
|
| 590 | nul { return true; }
|
| 591 |
|
| 592 | whitespace { TOK(Id::WS); }
|
| 593 |
|
| 594 | // Resolved in fix-up pass
|
| 595 | pound_comment { TOK(Id::MaybeComment); }
|
| 596 |
|
| 597 | // not that relevant for shell
|
| 598 | identifier { TOK(Id::Name); }
|
| 599 |
|
| 600 | // Not the start of a string, escaped, comment, identifier, here doc
|
| 601 | [^\x00"'$#_a-zA-Z\\<]+ { TOK(Id::Other); }
|
| 602 |
|
| 603 | // echo is like a string
|
| 604 | "\\" . { TOK(Id::Str); }
|
| 605 |
|
| 606 | ['] { TOK_MODE(Id::Str, sh_mode_e::SQ); }
|
| 607 | ["] { TOK_MODE(Id::Str, sh_mode_e::DQ); }
|
| 608 | "$'" { TOK_MODE(Id::Str, sh_mode_e::DollarSQ); }
|
| 609 |
|
| 610 | // <<- is another syntax
|
| 611 | here_op = "<<" [-]? [ \t]*;
|
| 612 | h_delim = [_a-zA-Z][_a-zA-Z0-9]*;
|
| 613 |
|
| 614 | // unquoted or quoted
|
| 615 | here_op @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
| 616 | here_op ['] @s h_delim @e ['] { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
| 617 | here_op ["] @s h_delim @e ["] { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
| 618 | here_op "\\" @s h_delim @e { SUBMATCH(s, e); TOK(Id::HereBegin); }
|
| 619 |
|
| 620 | // NOT Unknown, as in Python
|
| 621 | * { TOK(Id::Other); }
|
| 622 |
|
| 623 | */
|
| 624 | }
|
| 625 | break;
|
| 626 |
|
| 627 | case sh_mode_e::SQ:
|
| 628 | // Search until next ' unconditionally
|
| 629 | while (true) {
|
| 630 | /*!re2c
|
| 631 | nul { return true; }
|
| 632 |
|
| 633 | ['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
|
| 634 |
|
| 635 | [^\x00']* { TOK(Id::Str); }
|
| 636 |
|
| 637 | * { TOK(Id::Str); }
|
| 638 |
|
| 639 | */
|
| 640 | }
|
| 641 | break;
|
| 642 |
|
| 643 | case sh_mode_e::DQ:
|
| 644 | // Search until next " that's not preceded by "
|
| 645 | while (true) {
|
| 646 | /*!re2c
|
| 647 | nul { return true; }
|
| 648 |
|
| 649 | ["] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
|
| 650 |
|
| 651 | dq_middle { TOK(Id::Str); }
|
| 652 |
|
| 653 | * { TOK(Id::Str); }
|
| 654 |
|
| 655 | */
|
| 656 | }
|
| 657 | break;
|
| 658 |
|
| 659 | case sh_mode_e::DollarSQ:
|
| 660 | // Search until next ' that's not preceded by "
|
| 661 | while (true) {
|
| 662 | /*!re2c
|
| 663 | nul { return true; }
|
| 664 |
|
| 665 | ['] { TOK_MODE(Id::Str, sh_mode_e::Outer); }
|
| 666 |
|
| 667 | sq_middle { TOK(Id::Str); }
|
| 668 |
|
| 669 | * { TOK(Id::Str); }
|
| 670 |
|
| 671 | */
|
| 672 | }
|
| 673 | break;
|
| 674 | case sh_mode_e::YshSQ:
|
| 675 | case sh_mode_e::YshDQ:
|
| 676 | case sh_mode_e::YshJ:
|
| 677 | assert(0);
|
| 678 | }
|
| 679 |
|
| 680 | tok->end_col = p - lexer->line_;
|
| 681 | lexer->p_current = p;
|
| 682 | return false;
|
| 683 | }
|
| 684 |
|
// Modes of the HTML lexer.
enum class html_mode_e {
  Outer,          // <NAME enters the TAG state
  AttrName,       // NAME="  NAME='  NAME=  NAME
  AttrValue,      // NAME="  NAME='  NAME=
  SQ,             // respects Chars, can contain "
  DQ,             // respects Chars, can contain '
  Comm,           // <!-- -->
  Preprocessing,  // <? ?>
  CData,          // <![CDATA[ x ]]>
  HtmlCData,      // <script> <style>; Match() has no case for it and asserts
};
|
| 696 |
|
| 697 | // LeftStartTag -> RightStartTag <a href=/ >
|
| 698 | // LeftStartTag -> SelfClose <br id=foo />
|
| 699 |
|
| 700 | // Returns whether EOL was hit
|
| 701 | template <>
|
| 702 | bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
|
| 703 | const char* p = lexer->p_current; // mutated by re2c
|
| 704 | const char* YYMARKER = p;
|
| 705 |
|
| 706 | /*!re2c
|
| 707 | // Common definitions
|
| 708 |
|
| 709 | // Like _NAME_RE in HTM8
|
| 710 | name = [a-zA-Z][a-zA-Z0-9:_-]* ;
|
| 711 |
|
| 712 | // TODO: check this pattern
|
| 713 | char_name = "&" [a-zA-Z][a-zA-Z0-9]* ";" ;
|
| 714 | char_dec = "&#" [0-9]+ ";" ;
|
| 715 | char_hex = "&#x" [0-9a-fA-F]+ ";" ;
|
| 716 | */
|
| 717 |
|
| 718 | switch (lexer->line_mode) {
|
| 719 | case html_mode_e::Outer:
|
| 720 | while (true) {
|
| 721 | /*!re2c
|
| 722 | // accepted EOF
|
| 723 | nul { return true; }
|
| 724 |
|
| 725 | char_name { TOK(Id::CharEscape); }
|
| 726 | char_dec { TOK(Id::CharEscape); }
|
| 727 | char_hex { TOK(Id::CharEscape); }
|
| 728 |
|
| 729 | "&" { TOK(Id::BadAmpersand); }
|
| 730 | ">" { TOK(Id::BadGreaterThan); }
|
| 731 | "<" { TOK(Id::BadLessThan); }
|
| 732 |
|
| 733 | "</" name ">" { TOK(Id::EndTag); }
|
| 734 |
|
| 735 | "<" name {
|
| 736 | TOK_MODE(Id::TagNameLeft, html_mode_e::AttrName);
|
| 737 | // TODO: <script> <style> - special logic for strstr()
|
| 738 | }
|
| 739 |
|
| 740 | // Problem: these can span more than one linee ... it needs to be
|
| 741 | // another mode? The end tag might be technically the same.
|
| 742 | "<!" [^\x00>]* ">" { TOK(Id::Comm); }
|
| 743 |
|
| 744 | "<!--" { TOK_MODE(Id::Comm, html_mode_e::Comm); }
|
| 745 | "<?" { TOK_MODE(Id::Comm, html_mode_e::Preprocessing); }
|
| 746 | "<![CDATA[" { TOK_MODE(Id::Str, html_mode_e::CData); }
|
| 747 |
|
| 748 |
|
| 749 | // Like RawData
|
| 750 | * { TOK(Id::Other); }
|
| 751 |
|
| 752 | */
|
| 753 | }
|
| 754 | break;
|
| 755 | case html_mode_e::AttrName:
|
| 756 | while (true) {
|
| 757 | /*!re2c
|
| 758 | nul { return true; } // TODO: error
|
| 759 |
|
| 760 | // TODO: If the tag was <script> or <STYLE>, then we want to enter
|
| 761 | // HtmlCData mode, until we hit </script> or </STYLE>.
|
| 762 | // This is live throughout AttrName, AttrValue, SQ, DQ states?
|
| 763 | ">" { TOK_MODE(Id::TagNameRight, html_mode_e::Outer); }
|
| 764 | "/>" { TOK_MODE(Id::SelfClose, html_mode_e::Outer); }
|
| 765 |
|
| 766 | space_required name {
|
| 767 | // <a missing> - stay in the AttrName mode
|
| 768 | TOK(Id::AttrName);
|
| 769 | }
|
| 770 |
|
| 771 | space_required name whitespace '=' whitespace {
|
| 772 | // NAME= NAME=' NAME=" - expecting a value
|
| 773 | TOK_MODE(Id::AttrName, html_mode_e::AttrValue);
|
| 774 | }
|
| 775 |
|
| 776 | * { TOK(Id::Unknown); }
|
| 777 | */
|
| 778 | }
|
| 779 | break;
|
| 780 | case html_mode_e::AttrValue:
|
| 781 | while (true) {
|
| 782 | /*!re2c
|
| 783 | nul { return true; } // TODO: error
|
| 784 |
|
| 785 | ["] { TOK_MODE(Id::Str, html_mode_e::DQ); }
|
| 786 | ['] { TOK_MODE(Id::Str, html_mode_e::SQ); }
|
| 787 |
|
| 788 | // Unquoted value - a single token
|
| 789 | unquoted_value = [^\x00 \r\n\t<>&"']+ ;
|
| 790 |
|
| 791 | unquoted_value { TOK_MODE(Id::Str, html_mode_e::AttrName); }
|
| 792 |
|
| 793 | * { TOK(Id::Unknown); }
|
| 794 | */
|
| 795 | }
|
| 796 | break;
|
| 797 |
|
| 798 | case html_mode_e::DQ:
|
| 799 | while (true) {
|
| 800 | /*!re2c
|
| 801 | nul { return true; } // TODO: error
|
| 802 | char_name { TOK(Id::CharEscape); }
|
| 803 | char_dec { TOK(Id::CharEscape); }
|
| 804 | char_hex { TOK(Id::CharEscape); }
|
| 805 |
|
| 806 | // we would only need these for translation to XML, not
|
| 807 | // highlighting?
|
| 808 | "&" { TOK(Id::BadAmpersand); }
|
| 809 | ">" { TOK(Id::BadGreaterThan); }
|
| 810 | "<" { TOK(Id::BadLessThan); }
|
| 811 |
|
| 812 | ["] { TOK_MODE(Id::Str, html_mode_e::AttrName); }
|
| 813 | * { TOK(Id::Str); }
|
| 814 | */
|
| 815 | }
|
| 816 | break;
|
| 817 | case html_mode_e::SQ:
|
| 818 | while (true) {
|
| 819 | /*!re2c
|
| 820 | nul { return true; } // TODO: error
|
| 821 | char_name { TOK(Id::CharEscape); }
|
| 822 | char_dec { TOK(Id::CharEscape); }
|
| 823 | char_hex { TOK(Id::CharEscape); }
|
| 824 |
|
| 825 | // we would only need these for translation to XML, not
|
| 826 | // highlighting?
|
| 827 | "&" { TOK(Id::BadAmpersand); }
|
| 828 | ">" { TOK(Id::BadGreaterThan); }
|
| 829 | "<" { TOK(Id::BadLessThan); }
|
| 830 | ['] { TOK_MODE(Id::Str, html_mode_e::AttrName); }
|
| 831 |
|
| 832 | * { TOK(Id::Str); }
|
| 833 | */
|
| 834 | }
|
| 835 | break;
|
| 836 | case html_mode_e::Comm:
|
| 837 | // Search until next -->
|
| 838 | while (true) {
|
| 839 | /*!re2c
|
| 840 | nul { return true; }
|
| 841 |
|
| 842 | "-->" { TOK_MODE(Id::Comm, html_mode_e::Outer); }
|
| 843 |
|
| 844 | [^\x00-]* { TOK(Id::Comm); }
|
| 845 |
|
| 846 | * { TOK(Id::Comm); }
|
| 847 |
|
| 848 | */
|
| 849 | }
|
| 850 | break;
|
| 851 | case html_mode_e::Preprocessing:
|
| 852 | // Search until next ?>
|
| 853 | while (true) {
|
| 854 | /*!re2c
|
| 855 | nul { return true; }
|
| 856 |
|
| 857 | "?>" { TOK_MODE(Id::Comm, html_mode_e::Outer); }
|
| 858 |
|
| 859 | [^\x00?]* { TOK(Id::Comm); }
|
| 860 |
|
| 861 | * { TOK(Id::Comm); }
|
| 862 |
|
| 863 | */
|
| 864 | }
|
| 865 | break;
|
| 866 | case html_mode_e::CData:
|
| 867 | // Search until next ]]>
|
| 868 | while (true) {
|
| 869 | /*!re2c
|
| 870 | nul { return true; }
|
| 871 |
|
| 872 | "]]>" { TOK_MODE(Id::Str, html_mode_e::Outer); }
|
| 873 |
|
| 874 | [^\x00\]]* { TOK(Id::Str); }
|
| 875 |
|
| 876 | * { TOK(Id::Str); }
|
| 877 |
|
| 878 | */
|
| 879 | }
|
| 880 | break;
|
| 881 |
|
| 882 | default:
|
| 883 | assert(0);
|
| 884 | }
|
| 885 |
|
| 886 | tok->end_col = p - lexer->line_;
|
| 887 | lexer->p_current = p;
|
| 888 | return false;
|
| 889 | }
|
| 890 |
|
| 891 | // TODO:
|
| 892 | // - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
|
| 893 | // - same as C++ raw string, I think
|
| 894 | // - similar to here docs, but less complex
|
| 895 | //
|
| 896 | // Inherent problems with "micro segmentation":
|
| 897 | //
|
| 898 | // - Nested double quotes in shell. echo "hi ${name:-"default"}"
|
| 899 | // - This means that lexing is **dependent on** parsing: does the second
|
| 900 | // double quote **close** the first one, or does it start a nested string?
|
| 901 | // - lexing is non-recursive, parsing is recursive
|
| 902 |
|
// Shell comments depend on operator chars:
//   echo one # comment
//   echo $(( 16#ff ))
|
| 906 |
|
| 907 | #endif // MICRO_SYNTAX_H
|