OILS / osh / word_parse.py
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    InitializerWord,
    InitializerWord_t,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
    arith_expr,
    VarDecl,
    Mutation,
    word_part_e,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from display import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from libc import HAVE_FNM_EXTMATCH

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


def _IsValidYshWord(w):
    # type: (CompoundWord) -> bool
    """YSH word restriction

    Allowed:
        'foo'  r'foo'    --flag r'foo'
        --flag='foo'
        --flag="foo"
    Not allowed:
        --flag=r'bar'  NAME=u'value'   # ambiguous
        --flag=b''' multi '''
    """
    parts = w.parts
    n = len(parts)

    if n != 0 and word_.LiteralId(parts[0]) == Id.Lit_Tilde:
        # ~bob/src/'dir with spaces' is allowed
        # ~bob/src/u'dir with spaces' is ambiguous, but allowed for simplicity
        return True  # early return

    ok = True
    if n >= 2:
        for part in parts:
            if part.tag() in (word_part_e.SingleQuoted,
                              word_part_e.DoubleQuoted):
                ok = False

    # Allow special cases:
    #     --flag='val'  NAME='bar'
    # But NOT
    #     --flag=r'val'  NAME=r'val'
    if not ok:
        if (n == 2 and word_.LiteralId(parts[0]) == Id.Lit_VarLike):
            ok = True
        elif (n == 3 and word_.LiteralId(parts[0]) == Id.Lit_Chars and
              word_.LiteralId(parts[1]) == Id.Lit_Equals):
            ok = True

    return ok
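
# Illustrative mapping (added commentary, not from the original source) from
# the docstring's examples to the branches above; the token shapes are
# assumptions about the usual lexing:
#
#   NAME='bar'     n == 2: Lit_VarLike 'NAME=' + quoted part      -> allowed
#   --flag='val'   n == 3: Lit_Chars '--flag' + Lit_Equals + part -> allowed
#   --flag=r'bar'  neither special shape, per the docstring       -> rejected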


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate
        # to the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """
        Looking at the token after the first ':'

        ArithExpr? (':' ArithExpr? )? '}'
        """
        self._NextNonSpace()

        cur_id = self.token_type

        if cur_id in (Id.Arith_RBrace, Id.Arith_Colon):  # ${a:} or ${a::}
            begin = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()  # advance

        if cur_id == Id.Arith_RBrace:  # ${a:1} or ${@:1}
            # No length specified, so it's N
            no_length = None  # type: Optional[arith_expr_t]
            return suffix_op.Slice(begin, no_length)

        elif cur_id == Id.Arith_Colon:  # ${a:1:} or ${@:1:}
            colon_tok = self.cur_token
            self._NextNonSpace()

            if self.token_type == Id.Arith_RBrace:
                # quirky bash behavior:
                # ${a:1:} or ${a::} means length ZERO
                # but ${a:1} or ${a:} means length N
                if self.parse_opts.strict_parse_slice():
                    p_die(
                        "Slice length: Add explicit zero, or omit : for N (strict_parse_slice)",
                        colon_tok)

                length = arith_expr.EmptyZero  # type: arith_expr_t
            else:
                length = self._ReadArithExpr(Id.Arith_RBrace)

            return suffix_op.Slice(begin, length)

        else:
            p_die("Expected : or } in slice", self.cur_token)

        raise AssertionError()  # for MyPy
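
    # Illustrative walk-through (added commentary, not from the original
    # source): for ${a:1:2}, _ReadSliceVarOp is entered just after the first
    # ':'.  It parses '1' as the begin expression, sees Arith_Colon, and reads
    # '2' as the length up to the closing }, which appears as Arith_RBrace
    # because slices are read in arithmetic mode.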

    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace}   has an empty pattern
        # echo ${x////replace}  is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
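
    # Illustrative examples (added commentary, not from the original source),
    # mapping pat-sub source text to the replace_mode parsed above:
    #
    #   ${x/pat/repl}    replace_mode == Id.Undefined_Tok (replace first)
    #   ${x//pat/repl}   replace_mode == Id.Lit_Slash     (replace all)
    #   ${x/#pat/repl}   replace_mode == Id.Lit_Pound     (anchor at prefix)
    #   ${x/%pat/repl}   replace_mode == Id.Lit_Percent   (anchor at suffix)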

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full
        # arithmetic expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op
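
    # Illustrative examples (added commentary, not from the original source):
    #
    #   ${a[@]}    -> bracket_op.WholeArray(Id.Lit_At)
    #   ${a[*]}    -> bracket_op.WholeArray(Id.Arith_Star)
    #   ${a[i+1]}  -> bracket_op.ArrayIndex(<arith expression for i+1>)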

    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.name_tok = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}. """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME       = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER     = [0-9]+                  # ${10}, ${11}, ...

        Subscript  = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol  = '!' | '@' | '#' | ...
        VarOf      = NAME Subscript?
                   | NUMBER     # no subscript allowed, none of these are
                                # arrays; ${@[1]} doesn't work, even though
                                # slicing does
                   | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP    = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP   = '#' | '##' | '%' | '%%'
        CASE_OP    = ',' | ',,' | '^' | '^^'
        UnaryOp    = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY  = '|' | ' '  # ${x|html} and ${x %.3f}.
                                # SPACE is the operator, not %
        Match      = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr    = VarOf
                   | VarOf NULLARY_OP
                   | VarOf UnaryOp WORD
                   | VarOf YSH_UNARY STATIC_WORD
                   | VarOf ':' ArithExpr (':' ArithExpr )?
                   | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf    # can't apply operators after length

        RefOrKeys  = '!' VarExpr  # CAN apply operators after a named ref
                                  # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, while the prefix ! can be.  However:

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part
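
    # Illustrative examples (added commentary, not from the original source)
    # of the LookPastSpace disambiguation above:
    #
    #   ${#}      -> '#' is the variable (number of args)
    #   ${#foo}   -> '#' is the length prefix
    #   ${!}      -> '!' is the variable (last background PID)
    #   ${!foo}   -> '!' is the prefix: a named reference to $foo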

    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token
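
    # Illustrative note (added commentary, not from the original source): for
    # a triple-quoted string like u'''...''', expected_end_tokens is 3, so
    # the loop above exits only after three CONSECUTIVE Right tokens.  A lone
    # ' in the body bumps num_end_tokens to 1, but the next non-Right token
    # resets it to 0, so that quote is kept as content.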

    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)
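
    # Illustrative examples (added commentary, not from the original source):
    #
    #   @(foo|bar)  -> arms [foo, bar]
    #   @(foo|)     -> arms [foo, <empty>]  # the read_word flag handles this
    #   @(||)       -> arms [<empty>, <empty>, <empty>]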

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars,
            # to allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            if left_token.id in (Id.Left_TDoubleQuote,
                                 Id.Left_DollarTDoubleQuote):
                expected_end_tokens = 3
            else:
                expected_end_tokens = 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if (left_token and left_token.id
                in (Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote)):
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Note: something like ${foo%%a b c} is also treated as double quoted,
        until you hit the closing }.
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.
                    # No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # Save lines into a new, temporary arena, so SnipCodeBlock()
            # isn't messed up.  Note: This is similar to how we parse aliases
            # in osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the reader is different.
            arena = alloc.Arena()
            # TODO: arena.PushSource()?

            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)
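
    # Illustrative note (added commentary, not from the original source): in
    # the plain backtick case, the text between ` and ` is collected into
    # code_str and re-parsed.  So for `echo \`date\``, the \` tokens are
    # unescaped to `, code_str becomes "echo `date`", and the inner backticks
    # are handled on the second parsing pass.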

    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what the
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what the
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        # case (x) {                     case (x) {
        #   (else) { = x }                 (else) { = x }
        #          ^ The lexer is here            ^ Unread to here
        # }                              }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of _NewlineOk which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id indicating which of these forms comes next:

        word     { echo word }
        (3)      { echo expr }
        /e/      { echo eggex }
        }        # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                if not self.lexer.MoveToNextLine():  # Try to move to next line
                    break  # EOF
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in cases like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        # $(( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get closing )
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating Id.Op_DRightParen token for
        location info.
        """
        # (( )) is valid
        anode = arith_expr.EmptyZero  # type: arith_expr_t

        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # Ensure we get the second )
        self._GetToken()
        right = self.cur_token
        if right.id != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', right)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _NextNonSpace(self):
        # type: () -> None
        """Advance in lex_mode_e.Arith until non-space token.

        Same logic as _ReadWord, but used in
           $(( ))
           (( ))
           for (( ))

        You can read self.token_type after this, without calling _GetToken.
        """
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._NextNonSpace()  # skip over ((
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._NextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            # empty condition is TRUE
            cond_node = arith_expr.EmptyOne  # type: arith_expr_t
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._NextNonSpace()
        if self.token_type == Id.Arith_RParen:  # for (( ; ; ))
            update_node = arith_expr.EmptyZero  # type: arith_expr_t
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)

        self._NextNonSpace()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node
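
    # Illustrative examples (added commentary, not from the original source)
    # of the empty-section defaults above:
    #
    #   for (( ; ; ))        -> init EmptyZero, cond EmptyOne (always true),
    #                           update EmptyZero
    #   for (( i=0; ; ++i )) -> only the condition defaults to EmptyOne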

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_Initializer:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        initializer_words = []  # type: List[InitializerWord_t]
        for w in words:
            pair = word_.DetectAssocPair(w)
            if pair is not None:
                word_.TildeDetectAssign(pair.value)  # pair.value is modified
                initializer_words.append(pair)
            else:
                w2 = braces.BraceDetect(w)  # type: word_t
                if w2 is None:
                    w2 = w
                w3 = word_.TildeDetect(w2)  # type: word_t
                if w3 is None:
                    w3 = w2
                initializer_words.append(InitializerWord.ArrayWord(w3))

        # invariant List?
        return word_part.InitializerLiteral(left_token, initializer_words,
                                            right_token)
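
    # Illustrative examples (added commentary, not from the original source):
    #
    #   a=(1 2 3)        -> three InitializerWord.ArrayWord entries
    #   A=([x]=4 [y]=5)  -> two assoc pairs found by word_.DetectAssocPair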

    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

    def _MaybeReadWordPart(self, is_first, lex_mode, parts):
        # type: (bool, lex_mode_t, List[word_part_t]) -> bool
        """Helper for _ReadCompoundWord3."""
        done = False

        if self.token_type == Id.Lit_EscapedChar:
            tok = self.cur_token
            assert tok.length == 2
            ch = lexer.TokenSliceLeft(tok, 1)
            if not self.parse_opts.parse_backslash():
                if not pyutil.IsValidCharEscape(ch):
                    p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
                          self.cur_token)

            part = word_part.EscapedLiteral(self.cur_token,
                                            ch)  # type: word_part_t
        else:
            part = self.cur_token

        if is_first and self.token_type == Id.Lit_VarLike:  # foo=
            parts.append(part)
            # Unfortunately it's awkward to pull the check for a=(1 2) up to
            # _ReadWord.
            next_id = self.lexer.LookPastSpace(lex_mode)
            if next_id == Id.Op_LParen:
                self.lexer.PushHint(Id.Op_RParen, Id.Right_Initializer)
                part2 = self._ReadArrayLiteral()
                parts.append(part2)

                # Array literal must be the last part of the word.
                self._SetNext(lex_mode)
                self._GetToken()
                # EOF, whitespace, newline, Right_Subshell
                if self.token_kind not in KINDS_THAT_END_WORDS:
                    p_die('Unexpected token after array literal',
                          self.cur_token)
                done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_Splice):

            splice_tok = self.cur_token
            part2 = word_part.Splice(splice_tok,
                                     lexer.TokenSliceLeft(splice_tok, 1))

            parts.append(part2)

            # @words must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after array splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense
            # for @ at the beginning of a word to be reserved.

            # Although should we relax 'echo @' ?  I'm tempted to have a
            # shortcut for @_argv and
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done
1821
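    # Hedged summary of the "must be the last part" rule enforced above
    # (my annotation of the checks, using KINDS_THAT_END_WORDS):
    #
    #   a=(1 2 3)      OK      a=(1 2 3)x      rejected after array literal
    #   @words         OK      @words.txt      rejected after array splice
    #   @[split(x)]    OK      @[split(x)]y    rejected after Expr splice
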
    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash,
        but it could be an operator delimiting a compound word.  Can we change
        lexer modes and remove this special case?
        """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Syntax error for { and }
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.
                            # But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also use ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    if HAVE_FNM_EXTMATCH == 0:
                        p_die(
                            "Extended glob won't work without FNM_EXTMATCH support in libc",
                            self.cur_token)
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac'
                # is valid, so to test for ESAC, we can read ) before getting
                # a chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So
                # here we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1

        # YSH word restriction
        # (r'' u'' b'' are stripped on shopt -s parse_ysh_string)
        if not self.parse_opts.parse_word_join() and not _IsValidYshWord(w):
            p_die("Invalid quoted word part in YSH (OILS-ERR-17)",
                  loc.WordPart(part))

        return w

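    # Hedged example of how the loop above decomposes a word (part names
    # follow syntax_asdl; the exact rendering is an assumption):
    #
    #   hi$x"$(echo y)"  ->  CompoundWord with 3 parts:
    #     Lit_Chars 'hi', SimpleVarSub $x, DoubleQuoted containing a CommandSub
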
    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """ Helper for ReadArithWord() """
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

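    # Hedged sketch of the dispatch above, for the input $(( x + 1 )):
    #
    #   'x'  -> Kind.Lit     -> compound word read in lex_mode_e.Arith
    #   ' '  -> Kind.Ignored -> skipped; returns None so the caller retries
    #   '+'  -> Kind.Arith   -> returned as-is for the TDOP arith parser
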
    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_Initializer):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b'' at the beginning of a word
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_ysh_string:
                    # echo r'hi' is like echo 'hi'
                    #
                    # echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

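    # Hedged examples of the r'' u'' b'' lookahead above, under
    # shopt -s parse_ysh_string (semantics paraphrased from the comments):
    #
    #   echo r'C:\dir'    -> raw string, backslash is literal
    #   echo u'\u{3bc}'   -> unicode escapes allowed
    #   echo b'\yff'      -> byte escapes allowed
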
    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.name_tok  # cheat to make test pass
        part.right = part.name_tok

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

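    # Hedged illustration: ${!ref} re-parses the *value* of ref at runtime,
    # so this method accepts strings like the following (subscript support
    # is my reading of _ParseVarOf, not verified here):
    #
    #   x        -> VarOf with just a name
    #   a[i+1]   -> VarOf with a bracket_op subscript
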
    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize:
           array= (1 2 3)
           YSH for ( versus bash for ((
           YSH if ( versus if test
           YSH while ( versus while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

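    # Hedged sketch of the two branches above: one token is buffered, so
    # lookahead either uses it directly or asks the lexer:
    #
    #   'if ('  -> cur_token is WS_Space  -> lexer.LookPastSpace() -> Op_LParen
    #   'if('   -> cur_token is Op_LParen -> returned directly
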
    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

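    # Hedged examples of what this lookahead classifies (shell inputs):
    #
    #   f() { echo hi; }      -> saw 'f(' then ')'      -> True
    #   f ( ) { echo hi; }    -> saw 'f', space, '( )'  -> True
    #   f (x)                 -> parens are not empty   -> False
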
    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

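    # Hedged usage sketch: a caller is assumed to loop until Eof, e.g.
    #
    #   while True:
    #       w = w_parser.ReadWord(lex_mode_e.ShCommand)
    #       if word_.CommandId(w) == Id.Eof_Real:
    #           break
    #       ...  # handle CompoundWord or operator Token
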
    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

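    # Hedged example of the here doc rule above:
    #
    #   cat <<EOF
    #   $x and $(echo hi) are substituted, but " is literal
    #   EOF
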
    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

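    # Hedged example: a prompt like
    #
    #   PS1='$(date) \$ '
    #
    # is assumed to parse into a CompoundWord mixing a CommandSub with
    # literal parts; prompt escapes like \$ are handled by a later stage.
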
    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4