OILS / data_lang / j8.py View on Github | oils.pub

1338 lines, 704 significant
1#!/usr/bin/env python2
"""
j8.py: J8 Notation, a superset of JSON

Later:

- Unify with ASDL pretty printing - NIL8
   - {} [] are identical
   - () is for statically typed ASDL data
     (command.Simple blame_tok:(...) words:[ ])
     although we are also using [] for typed ASDL arrays, not just JSON
   - object IDs
     - @ x123 can create an ID
     - ! x123 can reference an ID
   - <> can be for non-J8 data types?  For the = operator
   - 'hi \(name)' interpolation is useful for code

- Common between JSON8 and NIL8 - for writing by hand
  - comments - # line or // line (JSON5 uses // line, following JS)
  - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
  - commas
    - JSON8 could have trailing commas rule
    - NIL8 at least has no commas for [1 2 "hi"]
"""
25
26import math
27
28from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
29from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
30from _devbuild.gen.runtime_asdl import error_code_e
31from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
32
33from core import bash_impl
34from core import error
35from data_lang import pyj8
36# dependency issue: consts.py pulls in frontend/option_def.py
37from frontend import consts
38from frontend import match
39from mycpp import mops
40from mycpp import mylib
41from mycpp.mylib import tagswitch, iteritems, NewDict, log
42
43import fastfunc
44
45_ = log
46
47from typing import cast, Dict, List, Tuple, Optional
48
49
# Duplicated from ui.ValType() so that this module doesn't depend on ui
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of 'val', for displaying type errors in the UI."""
    type_name = value_str(val.tag(), dot=False)
    return type_name
56
57
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object 'val'.

        This Python version uses id(), i.e. the address, which is up to 64
        bits.  In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
68
69
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer identity for 'val', or -1 if it has none.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Helping to detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    object_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            object_id = HeapValueId(val)
    return object_id
91
92
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of 'val' as ' 0x<hex>', or '' when ValueId() is -1.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
101
102
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: The code point.  Values up to 0x7F give 1 byte, up to 0x7FF give
        2 bytes, up to 0xFFFF give 3 bytes, and anything larger gives 4
        bytes.

    Returns:
      A string with one char per encoded byte.

    Note that nothing here rejects code > 0x10FFFF:
    - that check happens in statically parsed $'' u''
    - but not dynamically parsed echo -e / printf, following bash/zsh
    Bits that don't fit in the 4-byte form are silently dropped.
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Continuation bytes 10xxxxxx, emitted lowest 6 bits first
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: 110xxxxx, 1110xxxx or 11110xxx.  The shifted 0x1E marker
    # spills past 8 bits, which is masked off below.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in bytes_]
    return ''.join(chars)
136
137
# Option bits for InstancePrinter.  They are OR'd together into 'options'.

# Print a container that's already being visited as [...] or {...} or (...),
# rather than raising error.Encode
SHOW_CYCLES = 1 << 1
LOSSY_JSON_STRINGS = 1 << 3  # JSON may lose data about strings
INF_NAN_ARE_NULL = 1 << 4  # another lossy json issue

# What to do with non-data values: print null, or raise error.Encode.
# If neither bit is set, they are printed by type name, like <Eggex>.
NON_DATA_IS_NULL = 1 << 6
NON_DATA_IS_ERROR = 1 << 7

# Hack until we fully translate
assert pyj8.LOSSY_JSON_STRINGS == LOSSY_JSON_STRINGS
148
149
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Print 'val' to 'buf' with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags passed to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
158
159
def PrintMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """Print 'val' as J8.  For json8 write (x) and toJson8()

    If type_errors is set, non-data values are errors; otherwise they are
    printed as null.

    Caller must handle error.Encode
    """
    options = NON_DATA_IS_ERROR if type_errors else NON_DATA_IS_NULL
    _Print(val, buf, indent, options=options)
172
173
def PrintJsonMessage(val, buf, indent, type_errors):
    # type: (value_t, mylib.BufWriter, int, bool) -> None
    """Print 'val' as JSON.  For json write (x) and toJson()

    If type_errors is set, non-data values are errors; otherwise they are
    printed as null.

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    non_data = NON_DATA_IS_ERROR if type_errors else NON_DATA_IS_NULL
    options = LOSSY_JSON_STRINGS | INF_NAN_ARE_NULL | non_data
    _Print(val, buf, indent, options=options)
187
188
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write 'val' on a single line, followed by a newline.

    For pp test_ (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=SHOW_CYCLES)

    f.write(line_buf.getvalue())
    f.write('\n')
200
201
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write 's' to 'buf' as a J8 string.  For pp proc, etc.

    With unquoted_ok, strings that fastfunc.CanOmitQuotes() accepts are
    written as-is, without quotes.
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
211
212
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return 's' encoded as a J8 string, on one line.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    return out.getvalue()
223
224
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return 's' encoded as a JSON string, on one line.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON_STRINGS)
    return out.getvalue()
234
235
class InstancePrinter(object):
    """Writes a value_t tree to a buffer as J8 or JSON text.

    The 'options' bit flags decide how object cycles, Inf/NaN floats and
    non-data values are rendered.  They are also handed to
    pyj8.WriteString() for every string and key.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        # Number of spaces per level, or -1 for everything on one line
        self.indent = indent
        self.options = options

        # HeapValueId(val) -> True while that container is being printed
        self.visiting = {}  # type: Dict[int, bool]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent an item that's inside a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent the closing bracket of a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, except in single-line mode."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, except in single-line mode."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List between [ and ]."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, item in enumerate(val.items):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(item, level + 1)
        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintMapping(self, d, left, right, level):
        # type: (Dict[str, value_t], str, str, int) -> None
        """Print key: value pairs between the 'left' and 'right' delimiters."""
        self.buf.write(left)
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write(right)
            return

        self._MaybeNewline()
        first = True
        for k, v in iteritems(d):
            if not first:
                self.buf.write(',')
                self._MaybeNewline()
            first = False

            self._ItemIndent(level)
            pyj8.WriteString(k, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(v, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(right)

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict as { }."""
        self._PrintMapping(val.d, '{', '}', level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj as ( ), followed by ' --> ' and each prototype."""
        self._PrintMapping(val.d, '(', ')', level)

        proto = val.prototype
        while proto:
            self.buf.write(' --> ')
            self._PrintMapping(proto.d, '(', ')', level)
            proto = proto.prototype

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object: write the "type" entry and the "data" key.

        Args:
          type_str: Already quoted, with a trailing comma -
            "InternalStringArray", "BashArray", or "BashAssoc",
        """
        self.buf.write('{')
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)
        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapper object with index -> string data."""
        self._PrintBashPrefix('"BashArray",', level)

        if bash_impl.BashArray_Count(val) != 0:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for k in bash_impl.BashArray_GetKeys(val):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                v, error_code = bash_impl.BashArray_GetElement(val, k)
                assert error_code == error_code_e.OK, error_code
                pyj8.WriteString(v, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')
        else:  # Special case like Python/JS
            self.buf.write('{}')

        self._PrintBashSuffix(level)

    def _PrintInternalStringArray(self, val, level):
        # type: (value.InternalStringArray, int) -> None
        """Print an InternalStringArray as a wrapper object.

        Entries that are None are skipped, so the data keys are the indices
        of the remaining entries.
        """
        self._PrintBashPrefix('"InternalStringArray",', level)

        if bash_impl.InternalStringArray_Count(val) != 0:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for i, s in enumerate(
                    bash_impl.InternalStringArray_GetValues(val)):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')
        else:  # Special case like Python/JS
            self.buf.write('{}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapper object with string -> string data."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if bash_impl.BashAssoc_Count(val) != 0:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for k2, v2 in iteritems(bash_impl.BashAssoc_GetDict(val)):
                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()
                first = False

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')
        else:  # Special case like Python/JS
            self.buf.write('{}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print 'val' at nesting depth 'level'.

        Raises:
          error.Encode: for an object cycle when SHOW_CYCLES isn't set, or a
            non-data value when NON_DATA_IS_ERROR is set.
        """
        # An indent of -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                    # Showing the ID would be nice for pretty printing, but
                    # the problem is we'd have to show it TWICE to make it
                    # meaningful
                    #
                    #self.buf.write('[ -->%s ]' % ValueIdString(val))
                    self.buf.write('[...]')
                    return

                self.visiting[heap_id] = True
                self._PrintList(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                    self.buf.write('{...}')
                    return

                self.visiting[heap_id] = True
                self._PrintDict(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                # Obj isn't data, so the NON_DATA_* options apply first
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't encode value of type Obj")
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                    return

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                if self.visiting.get(heap_id, False):
                    if not (self.options & SHOW_CYCLES):
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))

                    self.buf.write('(...)')
                    return

                self.visiting[heap_id] = True
                self._PrintObj(val, level)
                self.visiting[heap_id] = False

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.InternalStringArray):
                val = cast(value.InternalStringArray, UP_val)
                self._PrintInternalStringArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & NON_DATA_IS_ERROR:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
                elif self.options & NON_DATA_IS_NULL:
                    self.buf.write('null')
                else:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
627
628
class LexerDecoder(object):
    """Lexer for J8 / JSON that also decodes string literals.

    The interface is similar to SimpleLexer, except that each result also
    carries an optional decoded string.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str  # used in error messages

        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Make an error.Decode that spans self.pos to end_pos."""
        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Lex one token and update self.pos.

        Returns:
          (token ID, end position, decoded string).  The string is None
          unless the token is a string literal.

        Raises:
          error.Decode
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Pure JSON rejects the J8-only tokens
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if not self.is_j8:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """Like Next(), but for J8 Lines."""
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if tok_id == Id.Lit_Chars:
            if not pyj8.PartIsUtf8(self.s, self.pos, end_pos):
                raise self._Error(
                    'Invalid UTF-8 in %s string literal' % self.lang_str,
                    end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """Decode a string literal and update self.pos.

        Args:
          left_id: The token for the opening quote, which selects the string
            lexer and whether \\y escapes are allowed.
          str_pos: Position right after the opening quote.

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode
        """
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            # Tokens that are always errors
            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: hand back what was accumulated
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Otherwise, each kind of token contributes one 'part'
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                ch = self.s[str_pos + 1]  # the char after the backslash
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # 4 hex digits from each of the two \u escapes
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
823
824
class _Parser(object):
    """State shared by the parsers in this module: a lexer and the current token."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"  # for error messages

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        # The start of a token is the END of the token before it, e.g. for
        # a J8_Bool token
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                   Id.Ignored_Comment):
                break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is 'tok_id', then advance.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id != tok_id:
            #log('position %r %d-%d %r', self.s, self.start_pos,
            #    self.end_pos, self.s[self.start_pos:self.end_pos])
            raise self._ParseError("Expected %s, got %s" %
                                   (Id_str(tok_id), Id_str(self.tok_id)))
        self._Next()

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Make an error.Decode that spans the current token."""
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            self.lexer.cur_line_num)
875
876
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Save the potential string value, then check that it's a string
        key = self.decoded
        self._Eat(Id.J8_String)
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        # One pair, then another after each comma
        while True:
            k, v = self._ParsePair()
            d[k] = v
            #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        # One value, then another after each comma
        while True:
            items.append(self._ParseValue())

            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, an invalid token, or an integer that's too big
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        if self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token is 'true' if it starts with 't'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            ok, big = mops.FromStr2(part)
            if not ok:
                raise self._ParseError('Integer is too big')
            return value.Int(big)

        if self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        if self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input after the value.
        """
        self._Next()
        obj = self._ParseValue()

        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1015
1016
1017class Nil8Parser(_Parser):
1018 """
1019 Tokens not in JSON8:
1020 LParen RParen Symbol
1021
1022 Tokens not in JSON, but in JSON8 and NIL8:
1023 Identifier (unquoted keys)
1024 Ignored_Comment
1025 """
1026
1027 def __init__(self, s, is_j8):
1028 # type: (str, bool) -> None
1029 _Parser.__init__(self, s, is_j8)
1030
1031 if 0:
1032
1033 def _LookAhead(self):
1034 # type: () -> Id_t
1035 """
1036 Don't need this right now
1037 """
1038 end_pos = self.end_pos # look ahead from last token
1039 while True:
1040 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1041 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1042 Id.Ignored_Comment):
1043 break
1044 return tok_id
1045
1046 def _ParseRecord(self):
1047 # type: () -> nvalue_t
1048 """
1049 Yaks
1050 (self->Next) => (-> self Next)
1051 (self->Next obj.field) => ((-> self Next) (. obj field))
1052
1053 Similar to
1054 ((identity identity) 42) => 42 in Clojure
1055
1056 ASDL
1057 (Node left:(. x4beef2))
1058 (Node left !x4beef2)
1059
1060 # Ambiguous because value can be identifier.
1061 # We have to look ahead to and see if there's a colon :
1062 field =
1063 Identifier ':' value
1064 | value
1065
1066 record = '(' head field* ')'
1067
1068 - Identifier | Symbol are treated the same, it's a side effect of
1069 the lexing style
1070 - do positional args come before named args
1071 - () is invalid? Use [] for empty list
1072 """
1073 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1074
1075 items = [] # type: List[nvalue_t]
1076
1077 self._Next()
1078 if self.tok_id == Id.J8_RParen:
1079 self._Next()
1080 return nvalue.List(items)
1081
1082 #log('TOK %s', Id_str(self.tok_id))
1083 while self.tok_id != Id.J8_RParen:
1084 items.append(self._ParseNil8())
1085 #log('TOK 2 %s', Id_str(self.tok_id))
1086
1087 self._Eat(Id.J8_RParen)
1088
1089 return nvalue.List(items)
1090
1091 def _ParseList8(self):
1092 # type: () -> nvalue_t
1093 """
1094 List8 = '[' value* ']'
1095
1096 No commas, not even optional ones for now.
1097 """
1098 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1099
1100 items = [] # type: List[nvalue_t]
1101
1102 self._Next()
1103 if self.tok_id == Id.J8_RBracket:
1104 self._Next()
1105 return nvalue.List(items)
1106
1107 #log('TOK %s', Id_str(self.tok_id))
1108 while self.tok_id != Id.J8_RBracket:
1109 items.append(self._ParseNil8())
1110 #log('TOK 2 %s', Id_str(self.tok_id))
1111
1112 self._Eat(Id.J8_RBracket)
1113
1114 return nvalue.List(items)
1115
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, plus an optional infix operator and operand.

    The value that starts at the current token is parsed first.  If the
    token right after it is an operator, colon or comma, that token is
    taken as an infix operator: a second value is parsed recursively and
    the result is the 3-element list (op left right), e.g.

        key: "value"  ->  (: key "value")

    Raises the error built by self._ParseError() at end of input, or on a
    token that cannot start a value.
    """
    start_id = self.tok_id

    if start_id == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif start_id == Id.J8_LBracket:
        left = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif start_id == Id.J8_Null:
        self._Next()
        left = nvalue.Null

    elif start_id == Id.J8_Bool:
        # Only the first character of the token is inspected
        is_true = self.s[self.start_pos] == 't'
        self._Next()
        left = nvalue.Bool(is_true)

    elif start_id == Id.J8_Int:
        digits = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(digits))

    elif start_id == Id.J8_Float:
        literal = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(literal))

    elif start_id == Id.J8_String:
        # Read self.decoded before advancing to the next token
        decoded = self.decoded
        self._Next()
        left = nvalue.Str(decoded)

    elif start_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                      Id.J8_Comma):
        # An unquoted "word" (<- etc.) becomes a symbol
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Symbol(word)

    elif start_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # no other token can start a value
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands on its own
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

    self._Next()
    right = self._ParseNil8()
    return nvalue.List([op, left, right])
1188
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the input as a single NIL8 value.

    Anything left over after that value is an error.

    Raises error.Decode.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1199
1200
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

      where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__, which is not visible here -- confirm there.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log label 's' with the current token and its span."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line; append its text to 'out' unless it is empty.

        A line that starts with a J8 string contributes the decoded string.
        A line that starts with Lit_Chars contributes the raw input from
        that token up to the end of the line, minus one trailing WS_Space
        token.

        Raises:
          the error built by self._ParseError() if a J8 string is followed
            by anything other than optional whitespace and the end of line
          AssertionError if the line starts with any other kind of token
        """
        # Leading whitespace is never part of the line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()  # past newline
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # '  unquoted "" text on line  '  # read every token until end
            string_start = self.start_pos
            while True:
                # Remember the last token before the end, for stripping
                # whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # the end token starts here

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return its non-empty lines.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        # The loop has no 'break', so it ends only once the current token is
        # Id.Eol_Tok (or an error propagates).  A trailing-input check after
        # it could never fire.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1320
1321
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines; used by @(echo split command sub)

    Thin wrapper that runs a J8LinesParser over 's'.

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    lines = parser.Parse()
    return lines
1336
1337
1338# vim: sw=4