OILS / osh / word_parse.py
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


def _IsValidYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
        'foo'  r'foo'    --flag r'foo'
        --flag='foo'
        --flag="foo"
    Not allowed:
        --flag=r'bar'  NAME=u'value'   # ambiguous
        --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)

    if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
        # ~bob/src/'dir with spaces' is allowed
        # ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
        return True  # early return

    ok = True
    if n >= 2:
        for part in parts:
            if part.tag() in (word_part_e.SingleQuoted,
                              word_part_e.DoubleQuoted):
                ok = False

    # Allow special cases:
    #     --flag='val'  NAME='bar'
    # But NOT
    #     --flag=r'val'  NAME=r'val'
    if not ok:
        if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
            ok = True
        elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
              word_.LiteralId(parts[1]) == Id.Lit_Equals):
            ok = True

    return ok
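
# Illustrative mapping (added commentary, not from the original source) from
# the docstring's examples to the branches above; the token shapes are
# assumptions about the usual lexing:
#
#   NAME='bar'     n == 2: Lit_VarLike 'NAME=' + quoted part      -> allowed
#   --flag='val'   n == 3: Lit_Chars '--flag' + Lit_Equals + part -> allowed
#   --flag=r'bar'  neither special shape, per the docstring       -> rejected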


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
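
    # Illustrative walk-through (added commentary, not from the original
    # source): for ${a:1:2}, _ReadSliceVarOp is entered just after the first
    # ':'.  It parses '1' as the begin expression, sees Arith_Colon, and reads
    # '2' as the length up to the closing }, which appears as Arith_RBrace
    # because slices are read in arithmetic mode.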

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace}   has an empty pattern
        # echo ${x////replace}  is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
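
    # Illustrative examples (added commentary, not from the original source),
    # mapping pat-sub source text to the replace_mode parsed above:
    #
    #   ${x/pat/repl}    replace_mode == Id.Undefined_Tok (replace first)
    #   ${x//pat/repl}   replace_mode == Id.Lit_Slash     (replace all)
    #   ${x/#pat/repl}   replace_mode == Id.Lit_Pound     (anchor at prefix)
    #   ${x/%pat/repl}   replace_mode == Id.Lit_Percent   (anchor at suffix)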

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
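
    # Illustrative examples (added commentary, not from the original source):
    #
    #   ${a[@]}    -> bracket_op.WholeArray(Id.Lit_At)
    #   ${a[*]}    -> bracket_op.WholeArray(Id.Arith_Star)
    #   ${a[i+1]}  -> bracket_op.ArrayIndex(<arith expression for i+1>)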

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                  # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '  # ${x|html} and ${x %.3f}.
                                # SPACE is the operator, not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf    # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, while the prefix ! can be.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
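
    # Illustrative examples (added commentary, not from the original source)
    # of the LookPastSpace disambiguation above:
    #
    #   ${#}      -> '#' is the variable (number of args)
    #   ${#foo}   -> '#' is the length prefix
    #   ${!}      -> '!' is the variable (last background PID)
    #   ${!foo}   -> '!' is the prefix: a named reference to $foo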

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
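
    # Illustrative note (added commentary, not from the original source): for
    # a triple-quoted string like u'''...''', expected_end_tokens is 3, so
    # the loop above exits only after three CONSECUTIVE Right tokens.  A lone
    # ' in the body bumps num_end_tokens to 1, but the next non-Right token
    # resets it to 0, so that quote is kept as content.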

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
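
    # Illustrative examples (added commentary, not from the original source):
    #
    #   @(foo|bar)  -> arms [foo, bar]
    #   @(foo|)     -> arms [foo, <empty>]  # the read_word flag handles this
    #   @(||)       -> arms [<empty>, <empty>, <empty>]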

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Note: something like ${foo%%a b c} is also treated as double quoted,
        until you hit the closing }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.
                    # No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # Save lines into a new, temporary arena, so SnipCodeBlock()
            # isn't messed up.  Note: This is similar to how we parse aliases
            # in osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the reader is different.
            arena = alloc.Arena()
            # TODO: arena.PushSource()?

            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
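
    # Illustrative note (added commentary, not from the original source): in
    # the plain backtick case, the text between ` and ` is collected into
    # code_str and re-parsed.  So for `echo \`date\``, the \` tokens are
    # unescaped to `, code_str becomes "echo `date`", and the inner backticks
    # are handled on the second parsing pass.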

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what the
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what the
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                     case (x) {
        #   (else) { = x }                 (else) { = x }
        #          ^ The lexer is here            ^ Unread to here
        # }                              }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of _NewlineOk which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id indicating which of these forms comes next:

        word     { echo word }
        (3)      { echo expr }
        /e/      { echo eggex }
        }        # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in cases like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
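
    # Illustrative examples (added commentary, not from the original source)
    # of the empty-section defaults above:
    #
    #   for (( ; ; ))        -> init EmptyZero, cond EmptyOne (always true),
    #                           update EmptyZero
    #   for (( i=0; ; ++i )) -> only the condition defaults to EmptyOne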

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
        for w in words:
            pair = word_.DetectAssocPair(w)
            if pair is not None:
                word_.TildeDetectAssign(pair.value)  # pair.value is modified
                initializer_words.append(pair)
            else:
                w2 = braces.BraceDetect(w)  # type: word_t
                if w2 is None:
                    w2 = w
                w3 = word_.TildeDetect(w2)  # type: word_t
                if w3 is None:
                    w3 = w2
                initializer_words.append(InitializerWord.ArrayWord(w3))

        # invariant List?
        return word_part.InitializerLiteral(left_token, initializer_words,
                                            right_token)
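
    # Illustrative examples (added commentary, not from the original source):
    #
    #   a=(1 2 3)        -> three InitializerWord.ArrayWord entries
    #   A=([x]=4 [y]=5)  -> two assoc pairs found by word_.DetectAssocPair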

    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

    def _MaybeReadWordPart(self, is_first, lex_mode, parts):
        # type: (bool, lex_mode_t, List[word_part_t]) -> bool
        """Helper for _ReadCompoundWord3."""
        done = False

        if self.token_type == Id.Lit_EscapedChar:
            tok = self.cur_token
            assert tok.length == 2
            ch = lexer.TokenSliceLeft(tok, 1)
            if not self.parse_opts.parse_backslash():
                if not pyutil.IsValidCharEscape(ch):
                    p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                          self.cur_token)

            part = word_part.EscapedLiteral(self.cur_token,
                                            ch)  # type: word_part_t
        else:
            part = self.cur_token

        if is_first and self.token_type == Id.Lit_VarLike:  # foo=
            parts.append(part)
            # Unfortunately it's awkward to pull the check for a=(1 2) up to
            # _ReadWord.
            next_id = self.lexer.LookPastSpace(lex_mode)
            if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # Array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
            # for @ at the beginning of a word to be reserved.

            # Although should we relax 'echo @' ?  I'm tempted to have a
            # shortcut for @_argv and
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done
1821
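    # Hedged summary of the "must be the last part" rule enforced above
    # (my annotation of the checks, using KINDS_THAT_END_WORDS):
    #
    #   a=(1 2 3)      OK      a=(1 2 3)x      rejected after array literal
    #   @words         OK      @words.txt      rejected after array splice
    #   @[split(x)]    OK      @[split(x)]y    rejected after Expr splice
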
    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash,
        but it could be an operator delimiting a compound word.  Can we change
        lexer modes and remove this special case?
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Syntax error for { and }
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.
                            # But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also use ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    if HAVE_FNM_EXTMATCH == 0:
                        p_die(
                            "Extended glob won't work without FNM_EXTMATCH support in libc",
                            self.cur_token)
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac'
                # is valid, so to test for ESAC, we can read ) before getting
                # a chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So
                # here we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1

        # YSH word restriction
        # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
        if not self.parse_opts.parse_word_join() and not _IsValidYshWord(w):
            p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
                  loc.WordPart(part))

        return w

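    # Hedged example of how the loop above decomposes a word (part names
    # follow syntax_asdl; the exact rendering is an assumption):
    #
    #   hi$x"$(echo y)"  ->  CompoundWord with 3 parts:
    #     Lit_Chars 'hi', SimpleVarSub $x, DoubleQuoted containing a CommandSub
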
    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """ Helper for ReadArithWord() """
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

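    # Hedged sketch of the dispatch above, for the input $(( x + 1 )):
    #
    #   'x'  -> Kind.Lit     -> compound word read in lex_mode_e.Arith
    #   ' '  -> Kind.Ignored -> skipped; returns None so the caller retries
    #   '+'  -> Kind.Arith   -> returned as-is for the TDOP arith parser
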
    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_Initializer):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b'' at the beginning of a word
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_ysh_string:
                    # echo r'hi' is like echo 'hi'
                    #
                    # echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

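    # Hedged examples of the r'' u'' b'' lookahead above, under
    # shopt -s parse_ysh_string (semantics paraphrased from the comments):
    #
    #   echo r'C:\dir'    -> raw string, backslash is literal
    #   echo u'\u{3bc}'   -> unicode escapes allowed
    #   echo b'\yff'      -> byte escapes allowed
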
    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.name_tok  # cheat to make test pass
        part.right = part.name_tok

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

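    # Hedged illustration: ${!ref} re-parses the *value* of ref at runtime,
    # so this method accepts strings like the following (subscript support
    # is my reading of _ParseVarOf, not verified here):
    #
    #   x        -> VarOf with just a name
    #   a[i+1]   -> VarOf with a bracket_op subscript
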
    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize:
           array= (1 2 3)
           YSH for ( versus bash for ((
           YSH if ( versus if test
           YSH while ( versus while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

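    # Hedged sketch of the two branches above: one token is buffered, so
    # lookahead either uses it directly or asks the lexer:
    #
    #   'if ('  -> cur_token is WS_Space  -> lexer.LookPastSpace() -> Op_LParen
    #   'if('   -> cur_token is Op_LParen -> returned directly
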
    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

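    # Hedged examples of what this lookahead classifies (shell inputs):
    #
    #   f() { echo hi; }      -> saw 'f(' then ')'      -> True
    #   f ( ) { echo hi; }    -> saw 'f', space, '( )'  -> True
    #   f (x)                 -> parens are not empty   -> False
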
    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

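    # Hedged usage sketch: a caller is assumed to loop until Eof, e.g.
    #
    #   while True:
    #       w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #       if word_.CommandId(w) == Id.Eof_Real:
    #           break
    #       ...  # handle CompoundWord or operator Token
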
    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

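    # Hedged example of the here doc rule above:
    #
    #   cat <<EOF
    #   $x and $(echo hi) are substituted, but " is literal
    #   EOF
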
    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

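    # Hedged example: a prompt like
    #
    #   PS1='$(date) \$ '
    #
    # is assumed to parse into a CompoundWord mixing a CommandSub with
    # literal parts; prompt escapes like \$ are handled by a later stage.
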
    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4