data_lang/j8.py

OILS / data_lang / j8.py View on Github | oils.pub

1338 lines, 704 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- Unify with ASDL pretty printing - NIL8
8	- {} [] are identical
9	- () is for statically typed ASDL data
10	(command.Simple blame_tok:(...) words:[ ])
11	although we are also using [] for typed ASDL arrays, not just JSON
12	- object IDs
13	- @ x123 can create an ID
14	- ! x123 can reference an ID
15	- <> can be for non-J8 data types? For the = operator
16	- 'hi \(name)' interpolation is useful for code
17
18	- Common between JSON8 and NIL8 - for writing by hand
19	- comments - # line or // line (JSON5 uses // line, following JS)
20	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
21	- commas
22	- JSON8 could have trailing commas rule
23	- NIL8 at least has no commas for [1 2 "hi"]
24	"""
25
26	import math
27
28	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
29	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
30	from _devbuild.gen.runtime_asdl import error_code_e
31	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
32
33	from core import bash_impl
34	from core import error
35	from data_lang import pyj8
36	# dependency issue: consts.py pulls in frontend/option_def.py
37	from frontend import consts
38	from frontend import match
39	from mycpp import mops
40	from mycpp import mylib
41	from mycpp.mylib import tagswitch, iteritems, NewDict, log
42
43	import fastfunc
44
45	_ = log
46
47	from typing import cast, Dict, List, Tuple, Optional
48
49
50	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Name of the value's type, for type errors shown in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
56
57
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a heap-allocated value.

        In Python this is id(val), i.e. the address, which may need up to
        64 bits.  In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
68
69
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
91
92
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' if it has no ID.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # -1 means the value is a primitive without identity
    return ''
101
102
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Everything above 0xFFFF is encoded with 4 bytes.  There is no
    code <= 0x10FFFF check here:
    - it happens in statically parsed $'' u''
    - but not dynamically parsed echo -e / printf, following bash/zsh
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    n_cont = 0  # number of continuation bytes after the lead byte
    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2
    else:
        n_cont = 3

    # Continuation bytes are produced low-order first, then reversed.
    encoded = []  # type: List[int]
    remaining = n_cont
    while remaining > 0:
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker in the high bits, leftover payload below.
    lead = (0x1E << (6 - n_cont)) | (code & (0x3F >> n_cont))
    encoded.append(lead)
    encoded.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
136
137
# Bit flags for the 'options' argument of _Print() and InstancePrinter.

# Print a cycle as [...] or {...} or (...), rather than raising error.Encode
SHOW_CYCLES = 1 << 1
LOSSY_JSON_STRINGS = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # another lossy json issue

# How to print values that aren't data.  NON_DATA_IS_ERROR is checked first.
NON_DATA_IS_NULL = 1 << 6
NON_DATA_IS_ERROR = 1 << 7
# Otherwise, non-data objects like Eggex will be <Eggex>

# Hack until we fully translate: pyj8 has its own copy of this flag, and the
# two must agree because self.options is passed to pyj8.WriteString().
assert pyj8.LOSSY_JSON_STRINGS == LOSSY_JSON_STRINGS
148
149
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, LOSSY_JSON_STRINGS, ...
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
158
159
def PrintMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """ For json8 write (x) and toJson8()

    Non-data values are an error if type_errors is set; otherwise they're
    printed as null.

    Caller must handle error.Encode
    """
    options = NON_DATA_IS_ERROR if type_errors else NON_DATA_IS_NULL
    _Print(val, buf, indent, options=options)
172
173
def PrintJsonMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """ For json write (x) and toJson()

    Always sets LOSSY_JSON_STRINGS and INF_NAN_ARE_NULL.  Non-data values
    are an error if type_errors is set; otherwise they're printed as null.

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    non_data = NON_DATA_IS_ERROR if type_errors else NON_DATA_IS_NULL
    options = LOSSY_JSON_STRINGS | INF_NAN_ARE_NULL | non_data
    _Print(val, buf, indent, options=options)
187
188
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """ For pp test_ (x)

    Writes the value on a single line, followed by a newline.
    """
    # error.Encode should be impossible - we show cycles and non-data
    line = mylib.BufWriter()
    _Print(val, line, -1, options=SHOW_CYCLES)

    f.write(line.getvalue())
    f.write('\n')
200
201
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """ For pp proc, etc.

    Writes 's' to 'buf'.  It's written bare if unquoted_ok is set and
    fastfunc.CanOmitQuotes(s) agrees; otherwise it goes through the printer
    as a value.Str on one line.
    """
    bare = unquoted_ok and fastfunc.CanOmitQuotes(s)
    if not bare:
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
211
212
def MaybeEncodeString(s):
    # type: (str) -> str
    """ For write --json8 $s and compexport

    Returns 's' printed as a value.Str on one line, with default options.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
223
224
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """ For write --json

    Like MaybeEncodeString(), but prints with LOSSY_JSON_STRINGS.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON_STRINGS)
    encoded = out.getvalue()
    return encoded
234
235
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to a mylib.BufWriter.  The 'options' bit flags (SHOW_CYCLES,
    INF_NAN_ARE_NULL, NON_DATA_IS_NULL, NON_DATA_IS_ERROR, ...) control how
    cycles, inf/nan, and non-data values are printed, and they're also passed
    to pyj8.WriteString() for strings.

    Print() raises error.Encode for cycles (unless SHOW_CYCLES) and for
    non-data values (if NON_DATA_IS_ERROR).
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where output is written
          indent: number of spaces per nesting level, or -1 for everything
                  on one line
          options: bit flags, e.g. SHOW_CYCLES | INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val).  True while that container is being
        # printed, which is how Print() detects cycles.
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at 'level' (one deeper)."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything is on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything is on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print [] or [item, item, ...], with one item per line if indenting."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key-value pairs between the 'left' and 'right' delimiters.

        Shared by Dict, which uses {}, and Obj, which uses ().
        """
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(left)
            self.buf.write(right)
        else:
            self.buf.write(left)
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as {"key": value, ...}"""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as (...), followed by ' --> ' and each prototype."""

        self._PrintMapping(val.d, '(', ')', level)

        if val.prototype:
            self.buf.write(' --> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Print the opening of {"type": ..., "data": for bash containers.

        type_str is written verbatim, so the caller passes it already quoted
        and with the trailing comma.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(
            type_str)  # "InternalStringArray", "BashArray", or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Print the closing } that matches _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print {"type": "BashArray", "data": {"<index>": "<str>", ...}}"""

        self._PrintBashPrefix('"BashArray",', level)

        if bash_impl.BashArray_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k in bash_impl.BashArray_GetKeys(val):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # The index is written as a string key
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                v, error_code = bash_impl.BashArray_GetElement(val, k)
                assert error_code == error_code_e.OK, error_code
                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintInternalStringArray(self, val, level):
        # type: (value.InternalStringArray, int) -> None
        """Print {"type": "InternalStringArray", "data": {"<index>": ...}}

        Entries that are None are skipped, so the printed indices may have
        gaps.
        """

        self._PrintBashPrefix('"InternalStringArray",', level)

        if bash_impl.InternalStringArray_Count(
                val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use i != 0 to decide on the comma, because the first
            # entries may be skipped
            first = True
            for i, s in enumerate(
                    bash_impl.InternalStringArray_GetValues(val)):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print {"type": "BashAssoc", "data": {"<key>": "<str>", ...}}"""

        self._PrintBashPrefix('"BashAssoc",', level)

        if bash_impl.BashAssoc_Count(val) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(bash_impl.BashAssoc_GetDict(val)):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print a value at the given nesting level, dispatching on its tag.

        Raises:
          error.Encode: for a List, Dict, or Obj in a cycle, unless
                        SHOW_CYCLES is set; or for a non-data value, if
                        NON_DATA_IS_ERROR is set
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        # Showing the ID would be nice for pretty printing, but
                        # the problem is we'd have to show it TWICE to make it
                        # meaningful
                        #
                        #self.buf.write('[ -->%s ]' % ValueIdString(val))
                        self.buf.write('[...]')
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))
                else:
                    # Mark as in progress only while the children are printed,
                    # so the same list may appear twice as a sibling
                    self.visiting[heap_id] = True
                    self._PrintList(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{...}')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintDict(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                # Obj isn't data, so it's only printed when neither NON_DATA_*
                # flag is set
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't encode value of type Obj")
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                    return

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if self.options & SHOW_CYCLES:
                        self.buf.write('(...)')
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))
                else:
                    self.visiting[heap_id] = True
                    self._PrintObj(val, level)
                    self.visiting[heap_id] = False

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.InternalStringArray):
                val = cast(value.InternalStringArray, UP_val)
                self._PrintInternalStringArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                # Every other type is non-data
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                else:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
627
628
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return (token ID, end position, decoded
    string).  The decoded string is None unless the token is Id.J8_String.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the whole input
          is_j8: if False, Next() rejects J8-only syntax: b'' u'' j"" strings
                 and comments
          lang_str: name of the language, for error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) an error.Decode that blames self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when is_j8 is False, or for an
                        invalid string literal
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # The opening quote was matched, so decode the rest of the string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char on an
                        unquoted line, or for an invalid string literal
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: token ID of the opening quote, which selects the string
                   lexer and whether \\yHH escapes are allowed
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF, a bad backslash escape, an unescaped ASCII
                        control char, invalid UTF-8, or an illegal code point
        """

        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char at str_pos + 1 selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits start 3 chars in, and stop before the last char
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits start 2 chars in
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, at offsets 2-6 and 8-12
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
823
824
class _Parser(object):
    """Token cursor shared by Parser, Nil8Parser, and J8LinesParser.

    Holds the current token: its ID, its span in the input, and its decoded
    value if it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"  # for error messages

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        skipping = True
        while skipping:
            # Each token starts where the one before it ended.  This is
            # updated for ignored tokens too, so that start_pos is where the
            # significant token begins.
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            # Lines are counted by the lexer, in lexer.cur_line_num.  The
            # position of the last newline and a token could be used to
            # calculate a column number.
            skipping = self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                                       Id.Ignored_Comment)

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token has the given ID, and advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        expected = Id_str(tok_id)
        actual = Id_str(self.tok_id)
        raise self._ParseError("Expected %s, got %s" % (expected, actual))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) an error.Decode that blames the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
875
876
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the tokens of the _Parser base class.
    ParseValue() is the entry point; it returns a value_t or raises
    error.Decode.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the whole message to parse
          is_j8: whether to accept J8 syntax, or only JSON
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse string ':' value, and return the key and value.

        Raises:
          error.Decode: if the current token isn't a string, or there's no
                        colon after it
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        A key that's repeated takes the last value.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, starting at the current token.

        On return, the current token is the one after the value.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that
                        mops.FromStr2() can't convert
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char distinguishes true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            ok, big = mops.FromStr2(part)
            if not ok:
                # Raise BEFORE advancing, so that the error blames the
                # integer itself and not the token after it
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # Save the decoded string, because _Next() overwrites it
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: if the value is invalid, or if it's followed by
                        anything except ignored tokens
        """
        self._Next()
        obj = self._ParseValue()

        # The current token must start at the end of the input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1015
1016
1017	class Nil8Parser(_Parser):
1018	"""
1019	Tokens not in JSON8:
1020	LParen RParen Symbol
1021
1022	Tokens not in JSON, but in JSON8 and NIL8:
1023	Identifier (unquoted keys)
1024	Ignored_Comment
1025	"""
1026
1027	def __init__(self, s, is_j8):
1028	# type: (str, bool) -> None
1029	_Parser.__init__(self, s, is_j8)
1030
1031	if 0:
1032
1033	def _LookAhead(self):
1034	# type: () -> Id_t
1035	"""
1036	Don't need this right now
1037	"""
1038	end_pos = self.end_pos # look ahead from last token
1039	while True:
1040	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1041	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1042	Id.Ignored_Comment):
1043	break
1044	return tok_id
1045
1046	def _ParseRecord(self):
1047	# type: () -> nvalue_t
1048	"""
1049	Yaks
1050	(self->Next) => (-> self Next)
1051	(self->Next obj.field) => ((-> self Next) (. obj field))
1052
1053	Similar to
1054	((identity identity) 42) => 42 in Clojure
1055
1056	ASDL
1057	(Node left:(. x4beef2))
1058	(Node left !x4beef2)
1059
1060	# Ambiguous because value can be identifier.
1061	# We have to look ahead to and see if there's a colon :
1062	field =
1063	Identifier ':' value
1064	\| value
1065
1066	record = '(' head field* ')'
1067
1068	- Identifier \| Symbol are treated the same, it's a side effect of
1069	the lexing style
1070	- do positional args come before named args
1071	- () is invalid? Use [] for empty list
1072	"""
1073	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1074
1075	items = [] # type: List[nvalue_t]
1076
1077	self._Next()
1078	if self.tok_id == Id.J8_RParen:
1079	self._Next()
1080	return nvalue.List(items)
1081
1082	#log('TOK %s', Id_str(self.tok_id))
1083	while self.tok_id != Id.J8_RParen:
1084	items.append(self._ParseNil8())
1085	#log('TOK 2 %s', Id_str(self.tok_id))
1086
1087	self._Eat(Id.J8_RParen)
1088
1089	return nvalue.List(items)
1090
1091	def _ParseList8(self):
1092	# type: () -> nvalue_t
1093	"""
1094	List8 = '[' value* ']'
1095
1096	No commas, not even optional ones for now.
1097	"""
1098	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1099
1100	items = [] # type: List[nvalue_t]
1101
1102	self._Next()
1103	if self.tok_id == Id.J8_RBracket:
1104	self._Next()
1105	return nvalue.List(items)
1106
1107	#log('TOK %s', Id_str(self.tok_id))
1108	while self.tok_id != Id.J8_RBracket:
1109	items.append(self._ParseNil8())
1110	#log('TOK 2 %s', Id_str(self.tok_id))
1111
1112	self._Eat(Id.J8_RBracket)
1113
1114	return nvalue.List(items)
1115
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 expression, starting at the current token.

    Dispatch is on self.tok_id:

    - Id.J8_LParen is delegated to _ParseRecord()
    - Id.J8_LBracket is delegated to _ParseList8()
    - Null / Bool / Int / Float / String tokens become the matching nvalue
      variant
    - Identifier / Operator / Colon / Comma tokens become an nvalue.Symbol
      holding the token's text

    If the token that FOLLOWS the parsed expression is an operator, colon or
    comma, the expression is treated as the left operand of an infix form,
    which is returned in prefix order:

        key: "value"  ->  (: key "value")

    The right operand is parsed by a recursive call, so it may itself be an
    infix form.

    Raises the error built by self._ParseError() on EOF or on any token that
    can't start an expression.
    """
    # None of the conditions below advance the lexer, so one snapshot of the
    # token id is enough for the whole dispatch chain.
    tok_id = self.tok_id

    if tok_id == Id.J8_LParen:
        node = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        node = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        node = nvalue.Null
        self._Next()

    elif tok_id == Id.J8_Bool:
        # Only the first character of the token is inspected: 't' is true
        node = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif tok_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Int(int(text))

    elif tok_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Float(float(text))

    elif tok_id == Id.J8_String:
        # self.decoded must be read before advancing past the string token
        node = nvalue.Str(self.decoded)
        self._Next()

    # <- etc.
    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # unquoted "word" treated like a string
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        node = nvalue.Symbol(text)

    elif tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # every other token, e.g. Id.Unknown_Tok or Id.J8_RBracket
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(tok_id)))

    # Re-read self.tok_id here: every successful branch above advanced the
    # lexer.  Without a following operator, this is a plain expression.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return node

    # key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()

    rhs = self._ParseNil8()
    infix = nvalue.List([op, node, rhs])  # type: nvalue_t
    return infix
1188
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the whole input as exactly one NIL8 expression.

    Returns the nvalue_t produced by _ParseNil8().

    Raises error.Decode.
    """
    # _ParseNil8() dispatches on self.tok_id, so advance the lexer first.
    self._Next()
    result = self._ParseNil8()

    # After one complete expression, only the end of input is acceptable.
    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1199
1200
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
        Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the meaning of the second argument is defined by
        # _Parser.__init__, which isn't visible here - presumably it selects
        # J8 (rather than JSON) mode; confirm.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log label 's' with the current token and its span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, and append its content to 'out' if it has any.

        Follows the grammar in the class docstring:

        - empty_line: nothing is appended
        - j8_line: the decoded string (self.decoded) is appended; anything
          other than optional whitespace between the string and the end of
          the line is an error
        - unquoted_line: the raw slice of self.s from the first Lit_Chars
          token up to the end of the line is appended, minus a trailing
          WS_Space token

        On return, the lexer has been advanced past the token that ended the
        line.

        Raises the error built by self._ParseError() for text after a J8
        string.  Any other leading token is a bug: AssertionError.
        """
        #self._Show('1')

        # Optional leading whitespace, shared by all three productions
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past newline
            return

        # The only remaining production starts with Lit_Chars
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # Unquoted line
        # ' unquoted "" text on line '  # read every token until end
        string_start = self.start_pos
        while True:
            # Remember the token before the end token, for stripping
            # whitespace
            prev_id = self.tok_id
            prev_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        if prev_id == Id.WS_Space:
            string_end = prev_start  # remove trailing whitespace
        else:
            string_end = self.start_pos  # start of the end token

        out.append(self.s[string_start:string_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Decode every line of the input, and return the non-empty ones.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # No "trailing input" check is needed: the loop above can only exit
        # when the current token is Id.Eol_Tok.
        return lines
1320
1321
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.

    Used by @(echo split command sub)

    This is a thin wrapper: it builds a J8LinesParser over 's' and returns
    the result of its Parse() method.

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    # Keep the parser in a named local rather than chaining on a temporary.
    lines_parser = J8LinesParser(s)
    return lines_parser.Parse()
1336
1337
1338	# vim: sw=4