OILS / osh / string_ops.py View on Github | oils.pub

554 lines, 258 significant
1"""
2string_ops.py - String library functions that can be exposed with a saner syntax.
3
4OSH:
5
6 local y=${x//a*/b}
7
8YSH:
9
10 var y = x => sub('a*', 'b', :ALL)
11
12 Pass x => sub('a*', 'b', :ALL) => var y
13"""
14
15from _devbuild.gen.id_kind_asdl import Id
16from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
17from core import pyutil
18from display import ui
19from core import error
20from core.error import e_die, e_strict
21from mycpp.mylib import log
22from mycpp import mylib
23from osh import glob_
24
25import libc
26import fastfunc
27
28from typing import List, Tuple
29
30_ = log
31
32# Error types returned by fastfunc.Utf8DecodeOne
33# Derived from Utf8Error enum from data_lang/utf8.h
34UTF8_ERR_OVERLONG = -1 # Encodes a codepoint in more bytes than necessary
35UTF8_ERR_SURROGATE = -2 # Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF)
36UTF8_ERR_TOO_LARGE = -3 # Encodes a value greater than the max codepoint U+10FFFF
37UTF8_ERR_BAD_ENCODING = -4 # Encoding doesn't conform to the UTF-8 bit patterns
38UTF8_ERR_TRUNCATED_BYTES = -5 # It looks like there is another codepoint, but it has been truncated
39
40
41def Utf8Error_str(error):
42 # type: (int) -> str
43 if error == UTF8_ERR_OVERLONG:
44 return "UTF-8 decode: Overlong"
45 if error == UTF8_ERR_SURROGATE:
46 return "UTF-8 decode: Surrogate range"
47 if error == UTF8_ERR_TOO_LARGE:
48 return "UTF-8 decode: Integer too large"
49 if error == UTF8_ERR_BAD_ENCODING:
50 return "UTF-8 decode: Bad encoding"
51 if error == UTF8_ERR_TRUNCATED_BYTES:
52 return "UTF-8 decode: Truncated bytes"
53
54 raise AssertionError(0)
55
56
57def DecodeUtf8Char(s, start):
58 # type: (str, int) -> int
59 """Given a string and start index, decode the Unicode char immediately
60 following the start index. The start location is in bytes and should be
61 found using a function like NextUtf8Char or PreviousUtf8Char.
62
63 If the codepoint in invalid, we raise an `error.Expr`. (This is different
64 from {Next,Previous}Utf8Char which raises an `error.Strict` on encoding
65 errors.)
66 """
67 codepoint_or_error, _bytes_read = fastfunc.Utf8DecodeOne(s, start)
68 if codepoint_or_error < 0:
69 raise error.Expr(
70 "%s at offset %d in string of %d bytes" %
71 (Utf8Error_str(codepoint_or_error), start, len(s)), loc.Missing)
72 return codepoint_or_error
73
74
75def NextUtf8Char(s, i):
76 # type: (str, int) -> int
77 """Given a string and a byte offset, returns the byte position after the
78 character at this position. Usually this is the position of the next
79 character, but for the last character in the string, it's the position just
80 past the end of the string.
81
82 Validates UTF-8.
83 """
84 codepoint_or_error, bytes_read = fastfunc.Utf8DecodeOne(s, i)
85 if codepoint_or_error < 0:
86 e_strict(
87 "%s at offset %d in string of %d bytes" %
88 (Utf8Error_str(codepoint_or_error), i, len(s)), loc.Missing)
89 return i + bytes_read
90
91
92_INVALID_START = 'Invalid start of UTF-8 sequence'
93
94
95def _Utf8CharLen(starting_byte):
96 # type: (int) -> int
97 if (starting_byte >> 7) == 0b0:
98 return 1
99 elif (starting_byte >> 5) == 0b110:
100 return 2
101 elif (starting_byte >> 4) == 0b1110:
102 return 3
103 elif (starting_byte >> 3) == 0b11110:
104 return 4
105 else:
106 e_strict(_INVALID_START, loc.Missing)
107
108
109def PreviousUtf8Char(s, i):
110 # type: (str, int) -> int
111 """Given a string and a byte offset, returns the position of the character
112 before that offset. To start (find the first byte of the last character),
113 pass len(s) for the initial value of i.
114
115 Validates UTF-8.
116 """
117 # All bytes in a valid UTF-8 string have one of the following formats:
118 #
119 # 0xxxxxxx (1-byte char)
120 # 110xxxxx (start of 2-byte char)
121 # 1110xxxx (start of 3-byte char)
122 # 11110xxx (start of 4-byte char)
123 # 10xxxxxx (continuation byte)
124 #
125 # Any byte that starts with 10... MUST be a continuation byte,
126 # otherwise it must be the start of a character (or just invalid
127 # data).
128 #
129 # Walking backward, we stop at the first non-continuaton byte
130 # found. We try to interpret it as a valid UTF-8 character starting
131 # byte, and check that it indicates the correct length, based on how
132 # far we've moved from the original byte. Possible problems:
133 # * byte we stopped on does not have a valid value (e.g., 11111111)
134 # * start byte indicates more or fewer continuation bytes than we've seen
135 # * no start byte at beginning of array
136 #
137 # Note that because we are going backward, on malformed input, we
138 # won't error out in the same place as when parsing the string
139 # forwards as normal.
140 orig_i = i
141
142 while i > 0:
143 i -= 1
144 byte_as_int = mylib.ByteAt(s, i)
145 if (byte_as_int >> 6) != 0b10:
146 offset = orig_i - i
147 if offset != _Utf8CharLen(byte_as_int):
148 # Leaving a generic error for now, but if we want to, it's not
149 # hard to calculate the position where things go wrong. Note
150 # that offset might be more than 4, for an invalid utf-8 string.
151 e_strict(_INVALID_START, loc.Missing)
152 return i
153
154 e_strict(_INVALID_START, loc.Missing)
155
156
157def CountUtf8Chars(s):
158 # type: (str) -> int
159 """Returns the number of utf-8 characters in the byte string 's'.
160
161 TODO: Raise exception rather than returning a string, so we can set the exit
162 code of the command to 1 ?
163
164 $ echo ${#bad}
165 Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
166 $ echo $?
167 1
168 """
169 num_chars = 0
170 num_bytes = len(s)
171 i = 0
172 while i < num_bytes:
173 i = NextUtf8Char(s, i)
174 num_chars += 1
175 return num_chars
176
177
178def AdvanceUtf8Chars(s, num_chars, byte_offset):
179 # type: (str, int, int) -> int
180 """Starting from byte offset, advance by N UTF-8 runes
181
182 Returns a byte offset.
183
184 Used for shell slicing.
185 """
186 num_bytes = len(s)
187 i = byte_offset # current byte position
188
189 for _ in xrange(num_chars):
190 # Neither bash or zsh checks out of bounds for slicing. Either begin or
191 # length.
192 if i >= num_bytes:
193 return i
194 #raise RuntimeError('Out of bounds')
195
196 i = NextUtf8Char(s, i)
197
198 return i
199
200
201# Limited Unicode codepoints for whitespace characters.
202# Oils intentionally does not include characters from <USP>, as that set
203# depends on the version of the Unicode standard used.
204#
205# See discussion on the original pull request which added this list here:
206#
207# https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
208#
209# See also the Mozilla Javascript documentation, and the note on how
210# changes to the standard affected Javascript:
211#
212# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space
213
214SPACES = [
215 0x0009, # Horizontal tab (\t)
216 0x000A, # Newline (\n)
217 0x000B, # Vertical tab (\v)
218 0x000C, # Form feed (\f)
219 0x000D, # Carriage return (\r)
220 0x0020, # Normal space
221 0x00A0, # No-break space <NBSP>
222 0xFEFF, # Zero-width no-break space <ZWNBSP>
223]
224
225
226def _IsSpace(codepoint):
227 # type: (int) -> bool
228 return codepoint in SPACES
229
230
231def StartsWithWhitespaceByteRange(s):
232 # type: (str) -> Tuple[int, int]
233 """Returns the range of 's' which has leading whitespace characters.
234
235 If 's' has no leading whitespace, an valid but empty range is returned.
236
237 The returned range is given as byte positions, and is a half-open range
238 "[start, end)" which is returned as a tuple.
239
240 Used for shell functions like 'trimStart' to match then trim whitespace.
241 """
242 len_s = len(s)
243 i = 0
244 while i < len_s:
245 codepoint = DecodeUtf8Char(s, i)
246 if not _IsSpace(codepoint):
247 break
248
249 try:
250 i = NextUtf8Char(s, i)
251 except error.Strict:
252 assert False, "DecodeUtf8Char should have caught any encoding errors"
253
254 start = 0
255 end = i
256 return (start, end)
257
258
259def EndsWithWhitespaceByteRange(s):
260 # type: (str) -> Tuple[int, int]
261 """Returns the range of 's' which has trailing whitespace characters.
262
263 If 's' has no leading whitespace, an valid but empty range is returned.
264
265 The returned range is given as byte positions, and is a half-open range
266 "[start, end)" which is returned as a tuple.
267
268 Used for shell functions like 'trimEnd' to match then trim whitespace.
269 """
270 len_s = len(s)
271 i = len_s
272 while i > 0:
273 # TODO: Gracefully handle surrogate pairs and overlong encodings when
274 # finding the start of each character.
275 prev = PreviousUtf8Char(s, i)
276
277 codepoint = DecodeUtf8Char(s, prev)
278 if not _IsSpace(codepoint):
279 break
280
281 i = prev
282
283 start = i
284 end = len_s
285 return (start, end)
286
287
288# Implementation without Python regex:
289#
290# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
291# regexec. in a loop. fnmatch() does NOT given positions of matches.
292#
293# (2) Strip -- % %% # ## -
294#
295# a. Fast path for constant strings.
296# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
297# doesn't match, short circuit out? We can't do this with fnmatch.
298# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
299#
300# - # shortest prefix - [:1], [:2], [:3] until it matches
301# - ## longest prefix - [:-1] [:-2], [:3]. Works because fnmatch does not
302# match prefixes, it matches EXACTLY.
303# - % shortest suffix - [-1:] [-2:] [-3:] ...
304# - %% longest suffix - [1:] [2:] [3:]
305#
306# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
307# mksh. Dash doesn't implement it.
308
309# TODO:
310# - Unicode support: Convert both pattern, string, and replacement to unicode,
311# then the result back at the end.
312# - Compile time errors for [[:space:]] ?
313
314
315def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
316 # type: (str, Token, str, bool) -> str
317 """Helper for ${x#prefix} and family."""
318
319 id_ = op_tok.id
320
321 # Fast path for constant strings.
322 # TODO: Should be LooksLikeExtendedGlob!
323 if not is_extglob and not glob_.LooksLikeGlob(arg):
324 # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
325 # reverse it. NOTE: We also do this check in Globber.Expand(). It would
326 # be nice to somehow store the original string rather than
327 # escaping/unescaping.
328 arg = glob_.GlobUnescape(arg)
329
330 if id_ in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix
331 # explicit check for non-empty arg (len for mycpp)
332 if len(arg) and s.startswith(arg):
333 return s[len(arg):]
334 else:
335 return s
336
337 elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix
338 # need explicit check for non-empty arg (len for mycpp)
339 if len(arg) and s.endswith(arg):
340 return s[:-len(arg)]
341 else:
342 return s
343
344 # These operators take glob arguments, we don't implement that obscure case.
345 elif id_ == Id.VOp1_Comma: # Only lowercase the first letter
346 if arg != '':
347 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
348 if len(s):
349 return s[0].lower() + s[1:]
350 else:
351 return s
352
353 elif id_ == Id.VOp1_DComma:
354 if arg != '':
355 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
356 return s.lower()
357
358 elif id_ == Id.VOp1_Caret: # Only uppercase the first letter
359 if arg != '':
360 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
361 if len(s):
362 return s[0].upper() + s[1:]
363 else:
364 return s
365
366 elif id_ == Id.VOp1_DCaret:
367 if arg != '':
368 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
369 return s.upper()
370
371 else: # e.g. ^ ^^ , ,,
372 raise AssertionError(id_)
373
374 # For patterns, do fnmatch() in a loop.
375 #
376 # TODO:
377 # - Another potential fast path:
378 # v=aabbccdd
379 # echo ${v#*b} # strip shortest prefix
380 #
381 # If the whole thing doesn't match '*b*', then no test can succeed. So we
382 # can fail early. Conversely echo ${v%%c*} and '*c*'.
383 #
384 # (Although honestly this whole construct is nuts and should be deprecated.)
385
386 # Bug fix: libc.fnmatch() takes NUL-terminated strings, so truncate it here
387 # to avoid disagreement about the length.
388 i = s.find('\0')
389 if i == -1:
390 n = len(s)
391 else:
392 s = s[:i]
393 n = i
394
395 if id_ == Id.VOp1_Pound: # shortest prefix
396 # 'abcd': match '', 'a', 'ab', 'abc', ...
397 i = 0
398 #log('# %r', s)
399 while True:
400 #log(' i %d', i)
401 assert i <= n
402 #log('Matching pattern %r with %r', arg, s[:i])
403 if libc.fnmatch(arg, s[:i]):
404 return s[i:]
405 if i >= n:
406 break
407 i = NextUtf8Char(s, i)
408 return s
409
410 elif id_ == Id.VOp1_DPound: # longest prefix
411 # 'abcd': match 'abc', 'ab', 'a'
412 i = n
413 while True:
414 assert i >= 0
415 #log('Matching pattern %r with %r', arg, s[:i])
416 if libc.fnmatch(arg, s[:i]):
417 return s[i:]
418 if i == 0:
419 break
420 i = PreviousUtf8Char(s, i)
421 return s
422
423 elif id_ == Id.VOp1_Percent: # shortest suffix
424 # 'abcd': match 'abcd', 'abc', 'ab', 'a'
425 i = n
426 while True:
427 assert i >= 0
428 #log('Matching pattern %r with %r', arg, s[:i])
429 if libc.fnmatch(arg, s[i:]):
430 return s[:i]
431 if i == 0:
432 break
433 i = PreviousUtf8Char(s, i)
434 return s
435
436 elif id_ == Id.VOp1_DPercent: # longest suffix
437 # 'abcd': match 'abc', 'bc', 'c', ...
438 i = 0
439 while True:
440 assert i <= n
441 #log('Matching pattern %r with %r', arg, s[:i])
442 if libc.fnmatch(arg, s[i:]):
443 return s[:i]
444 if i >= n:
445 break
446 i = NextUtf8Char(s, i)
447 return s
448
449 else:
450 raise NotImplementedError(ui.PrettyId(id_))
451
452
453def _AllMatchPositions(s, regex):
454 # type: (str, str) -> List[Tuple[int, int]]
455 """Returns a list of all (start, end) match positions of the regex against
456 s.
457
458 (If there are no matches, it returns the empty list.)
459 """
460 matches = [] # type: List[Tuple[int, int]]
461 pos = 0
462 n = len(s)
463 while pos < n: # needed to prevent infinite loop in (.*) case
464 m = libc.regex_first_group_match(regex, s, pos)
465 if m is None:
466 break
467 matches.append(m)
468 start, end = m
469 pos = end # advance position
470 return matches
471
472
473def _PatSubAll(s, regex, replace_str):
474 # type: (str, str, str) -> str
475 parts = [] # type: List[str]
476 prev_end = 0
477 for start, end in _AllMatchPositions(s, regex):
478 parts.append(s[prev_end:start])
479 parts.append(replace_str)
480 prev_end = end
481 parts.append(s[prev_end:])
482 return ''.join(parts)
483
484
485class GlobReplacer(object):
486
487 def __init__(self, regex, replace_str, slash_tok):
488 # type: (str, str, Token) -> None
489
490 # TODO: It would be nice to cache the compilation of the regex here,
491 # instead of just the string. That would require more sophisticated use of
492 # the Python/C API in libc.c, which we might want to avoid.
493 self.regex = regex
494 self.replace_str = replace_str
495 self.slash_tok = slash_tok
496
497 def __repr__(self):
498 # type: () -> str
499 return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)
500
501 def Replace(self, s, op):
502 # type: (str, suffix_op.PatSub) -> str
503
504 regex = '(%s)' % self.regex # make it a group
505
506 if op.replace_mode == Id.Lit_Slash:
507 # Avoid infinite loop when replacing all copies of empty string
508 if len(self.regex) == 0:
509 return s
510
511 try:
512 # loop over matches
513 return _PatSubAll(s, regex, self.replace_str)
514 except RuntimeError as e:
515 # Not sure if this is possible since we convert from glob:
516 # libc.regex_first_group_match raises RuntimeError on regex syntax
517 # error.
518 msg = e.message # type: str
519 e_die('Error matching regex %r: %s' % (regex, msg),
520 self.slash_tok)
521
522 if op.replace_mode == Id.Lit_Pound:
523 regex = '^' + regex
524 elif op.replace_mode == Id.Lit_Percent:
525 regex = regex + '$'
526
527 m = libc.regex_first_group_match(regex, s, 0)
528 #log('regex = %r, s = %r, match = %r', regex, s, m)
529 if m is None:
530 return s
531 start, end = m
532 return s[:start] + self.replace_str + s[end:]
533
534
535def ShellQuoteB(s):
536 # type: (str) -> str
537 """Quote by adding backslashes.
538
539 Used for autocompletion, so it's friendlier for display on the
540 command line. We use the strategy above for other use cases.
541 """
542 # There's no way to escape a newline! Bash prints ^J for some reason, but
543 # we're more explicit. This will happen if there's a newline on a file
544 # system or a completion plugin returns a newline.
545
546 # NOTE: tabs CAN be escaped with \.
547 s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')
548
549 # ~ for home dir
550 # ! for history
551 # * [] ? for glob
552 # {} for brace expansion
553 # space because it separates words
554 return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')