osh/string_ops.py

OILS / osh / string_ops.py View on Github | oils.pub

554 lines, 258 significant

1	"""
2	string_ops.py - String library functions that can be exposed with a saner syntax.
3
4	OSH:
5
6	local y=${x//a*/b}
7
8	YSH:
9
10	var y = x => sub('a*', 'b', :ALL)
11
12	Pass x => sub('a*', 'b', :ALL) => var y
13	"""
14
15	from _devbuild.gen.id_kind_asdl import Id
16	from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
17	from core import pyutil
18	from display import ui
19	from core import error
20	from core.error import e_die, e_strict
21	from mycpp.mylib import log
22	from mycpp import mylib
23	from osh import glob_
24
25	import libc
26	import fastfunc
27
28	from typing import List, Tuple
29
30	_ = log
31
32	# Error types returned by fastfunc.Utf8DecodeOne
33	# Derived from Utf8Error enum from data_lang/utf8.h
34	UTF8_ERR_OVERLONG = -1 # Encodes a codepoint in more bytes than necessary
35	UTF8_ERR_SURROGATE = -2 # Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF)
36	UTF8_ERR_TOO_LARGE = -3 # Encodes a value greater than the max codepoint U+10FFFF
37	UTF8_ERR_BAD_ENCODING = -4 # Encoding doesn't conform to the UTF-8 bit patterns
38	UTF8_ERR_TRUNCATED_BYTES = -5 # It looks like there is another codepoint, but it has been truncated
39
40
41	def Utf8Error_str(error):
42	# type: (int) -> str
43	if error == UTF8_ERR_OVERLONG:
44	return "UTF-8 decode: Overlong"
45	if error == UTF8_ERR_SURROGATE:
46	return "UTF-8 decode: Surrogate range"
47	if error == UTF8_ERR_TOO_LARGE:
48	return "UTF-8 decode: Integer too large"
49	if error == UTF8_ERR_BAD_ENCODING:
50	return "UTF-8 decode: Bad encoding"
51	if error == UTF8_ERR_TRUNCATED_BYTES:
52	return "UTF-8 decode: Truncated bytes"
53
54	raise AssertionError(0)
55
56
57	def DecodeUtf8Char(s, start):
58	# type: (str, int) -> int
59	"""Given a string and start index, decode the Unicode char immediately
60	following the start index. The start location is in bytes and should be
61	found using a function like NextUtf8Char or PreviousUtf8Char.
62
63	If the codepoint in invalid, we raise an `error.Expr`. (This is different
64	from {Next,Previous}Utf8Char which raises an `error.Strict` on encoding
65	errors.)
66	"""
67	codepoint_or_error, _bytes_read = fastfunc.Utf8DecodeOne(s, start)
68	if codepoint_or_error < 0:
69	raise error.Expr(
70	"%s at offset %d in string of %d bytes" %
71	(Utf8Error_str(codepoint_or_error), start, len(s)), loc.Missing)
72	return codepoint_or_error
73
74
75	def NextUtf8Char(s, i):
76	# type: (str, int) -> int
77	"""Given a string and a byte offset, returns the byte position after the
78	character at this position. Usually this is the position of the next
79	character, but for the last character in the string, it's the position just
80	past the end of the string.
81
82	Validates UTF-8.
83	"""
84	codepoint_or_error, bytes_read = fastfunc.Utf8DecodeOne(s, i)
85	if codepoint_or_error < 0:
86	e_strict(
87	"%s at offset %d in string of %d bytes" %
88	(Utf8Error_str(codepoint_or_error), i, len(s)), loc.Missing)
89	return i + bytes_read
90
91
92	_INVALID_START = 'Invalid start of UTF-8 sequence'
93
94
95	def _Utf8CharLen(starting_byte):
96	# type: (int) -> int
97	if (starting_byte >> 7) == 0b0:
98	return 1
99	elif (starting_byte >> 5) == 0b110:
100	return 2
101	elif (starting_byte >> 4) == 0b1110:
102	return 3
103	elif (starting_byte >> 3) == 0b11110:
104	return 4
105	else:
106	e_strict(_INVALID_START, loc.Missing)
107
108
109	def PreviousUtf8Char(s, i):
110	# type: (str, int) -> int
111	"""Given a string and a byte offset, returns the position of the character
112	before that offset. To start (find the first byte of the last character),
113	pass len(s) for the initial value of i.
114
115	Validates UTF-8.
116	"""
117	# All bytes in a valid UTF-8 string have one of the following formats:
118	#
119	# 0xxxxxxx (1-byte char)
120	# 110xxxxx (start of 2-byte char)
121	# 1110xxxx (start of 3-byte char)
122	# 11110xxx (start of 4-byte char)
123	# 10xxxxxx (continuation byte)
124	#
125	# Any byte that starts with 10... MUST be a continuation byte,
126	# otherwise it must be the start of a character (or just invalid
127	# data).
128	#
129	# Walking backward, we stop at the first non-continuaton byte
130	# found. We try to interpret it as a valid UTF-8 character starting
131	# byte, and check that it indicates the correct length, based on how
132	# far we've moved from the original byte. Possible problems:
133	# * byte we stopped on does not have a valid value (e.g., 11111111)
134	# * start byte indicates more or fewer continuation bytes than we've seen
135	# * no start byte at beginning of array
136	#
137	# Note that because we are going backward, on malformed input, we
138	# won't error out in the same place as when parsing the string
139	# forwards as normal.
140	orig_i = i
141
142	while i > 0:
143	i -= 1
144	byte_as_int = mylib.ByteAt(s, i)
145	if (byte_as_int >> 6) != 0b10:
146	offset = orig_i - i
147	if offset != _Utf8CharLen(byte_as_int):
148	# Leaving a generic error for now, but if we want to, it's not
149	# hard to calculate the position where things go wrong. Note
150	# that offset might be more than 4, for an invalid utf-8 string.
151	e_strict(_INVALID_START, loc.Missing)
152	return i
153
154	e_strict(_INVALID_START, loc.Missing)
155
156
157	def CountUtf8Chars(s):
158	# type: (str) -> int
159	"""Returns the number of utf-8 characters in the byte string 's'.
160
161	TODO: Raise exception rather than returning a string, so we can set the exit
162	code of the command to 1 ?
163
164	$ echo ${#bad}
165	Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
166	$ echo $?
167	1
168	"""
169	num_chars = 0
170	num_bytes = len(s)
171	i = 0
172	while i < num_bytes:
173	i = NextUtf8Char(s, i)
174	num_chars += 1
175	return num_chars
176
177
178	def AdvanceUtf8Chars(s, num_chars, byte_offset):
179	# type: (str, int, int) -> int
180	"""Starting from byte offset, advance by N UTF-8 runes
181
182	Returns a byte offset.
183
184	Used for shell slicing.
185	"""
186	num_bytes = len(s)
187	i = byte_offset # current byte position
188
189	for _ in xrange(num_chars):
190	# Neither bash or zsh checks out of bounds for slicing. Either begin or
191	# length.
192	if i >= num_bytes:
193	return i
194	#raise RuntimeError('Out of bounds')
195
196	i = NextUtf8Char(s, i)
197
198	return i
199
200
201	# Limited Unicode codepoints for whitespace characters.
202	# Oils intentionally does not include characters from <USP>, as that set
203	# depends on the version of the Unicode standard used.
204	#
205	# See discussion on the original pull request which added this list here:
206	#
207	# https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
208	#
209	# See also the Mozilla Javascript documentation, and the note on how
210	# changes to the standard affected Javascript:
211	#
212	# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space
213
214	SPACES = [
215	0x0009, # Horizontal tab (\t)
216	0x000A, # Newline (\n)
217	0x000B, # Vertical tab (\v)
218	0x000C, # Form feed (\f)
219	0x000D, # Carriage return (\r)
220	0x0020, # Normal space
221	0x00A0, # No-break space <NBSP>
222	0xFEFF, # Zero-width no-break space <ZWNBSP>
223	]
224
225
226	def _IsSpace(codepoint):
227	# type: (int) -> bool
228	return codepoint in SPACES
229
230
231	def StartsWithWhitespaceByteRange(s):
232	# type: (str) -> Tuple[int, int]
233	"""Returns the range of 's' which has leading whitespace characters.
234
235	If 's' has no leading whitespace, an valid but empty range is returned.
236
237	The returned range is given as byte positions, and is a half-open range
238	"[start, end)" which is returned as a tuple.
239
240	Used for shell functions like 'trimStart' to match then trim whitespace.
241	"""
242	len_s = len(s)
243	i = 0
244	while i < len_s:
245	codepoint = DecodeUtf8Char(s, i)
246	if not _IsSpace(codepoint):
247	break
248
249	try:
250	i = NextUtf8Char(s, i)
251	except error.Strict:
252	assert False, "DecodeUtf8Char should have caught any encoding errors"
253
254	start = 0
255	end = i
256	return (start, end)
257
258
259	def EndsWithWhitespaceByteRange(s):
260	# type: (str) -> Tuple[int, int]
261	"""Returns the range of 's' which has trailing whitespace characters.
262
263	If 's' has no leading whitespace, an valid but empty range is returned.
264
265	The returned range is given as byte positions, and is a half-open range
266	"[start, end)" which is returned as a tuple.
267
268	Used for shell functions like 'trimEnd' to match then trim whitespace.
269	"""
270	len_s = len(s)
271	i = len_s
272	while i > 0:
273	# TODO: Gracefully handle surrogate pairs and overlong encodings when
274	# finding the start of each character.
275	prev = PreviousUtf8Char(s, i)
276
277	codepoint = DecodeUtf8Char(s, prev)
278	if not _IsSpace(codepoint):
279	break
280
281	i = prev
282
283	start = i
284	end = len_s
285	return (start, end)
286
287
288	# Implementation without Python regex:
289	#
290	# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
291	# regexec. in a loop. fnmatch() does NOT given positions of matches.
292	#
293	# (2) Strip -- % %% # ## -
294	#
295	# a. Fast path for constant strings.
296	# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
297	# doesn't match, short circuit out? We can't do this with fnmatch.
298	# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
299	#
300	# - # shortest prefix - [:1], [:2], [:3] until it matches
301	# - ## longest prefix - [:-1] [:-2], [:3]. Works because fnmatch does not
302	# match prefixes, it matches EXACTLY.
303	# - % shortest suffix - [-1:] [-2:] [-3:] ...
304	# - %% longest suffix - [1:] [2:] [3:]
305	#
306	# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
307	# mksh. Dash doesn't implement it.
308
309	# TODO:
310	# - Unicode support: Convert both pattern, string, and replacement to unicode,
311	# then the result back at the end.
312	# - Compile time errors for [[:space:]] ?
313
314
315	def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
316	# type: (str, Token, str, bool) -> str
317	"""Helper for ${x#prefix} and family."""
318
319	id_ = op_tok.id
320
321	# Fast path for constant strings.
322	# TODO: Should be LooksLikeExtendedGlob!
323	if not is_extglob and not glob_.LooksLikeGlob(arg):
324	# It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
325	# reverse it. NOTE: We also do this check in Globber.Expand(). It would
326	# be nice to somehow store the original string rather than
327	# escaping/unescaping.
328	arg = glob_.GlobUnescape(arg)
329
330	if id_ in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix
331	# explicit check for non-empty arg (len for mycpp)
332	if len(arg) and s.startswith(arg):
333	return s[len(arg):]
334	else:
335	return s
336
337	elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix
338	# need explicit check for non-empty arg (len for mycpp)
339	if len(arg) and s.endswith(arg):
340	return s[:-len(arg)]
341	else:
342	return s
343
344	# These operators take glob arguments, we don't implement that obscure case.
345	elif id_ == Id.VOp1_Comma: # Only lowercase the first letter
346	if arg != '':
347	e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
348	if len(s):
349	return s[0].lower() + s[1:]
350	else:
351	return s
352
353	elif id_ == Id.VOp1_DComma:
354	if arg != '':
355	e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
356	return s.lower()
357
358	elif id_ == Id.VOp1_Caret: # Only uppercase the first letter
359	if arg != '':
360	e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
361	if len(s):
362	return s[0].upper() + s[1:]
363	else:
364	return s
365
366	elif id_ == Id.VOp1_DCaret:
367	if arg != '':
368	e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
369	return s.upper()
370
371	else: # e.g. ^ ^^ , ,,
372	raise AssertionError(id_)
373
374	# For patterns, do fnmatch() in a loop.
375	#
376	# TODO:
377	# - Another potential fast path:
378	# v=aabbccdd
379	# echo ${v#*b} # strip shortest prefix
380	#
381	# If the whole thing doesn't match 'b', then no test can succeed. So we
382	# can fail early. Conversely echo ${v%%c} and 'c*'.
383	#
384	# (Although honestly this whole construct is nuts and should be deprecated.)
385
386	# Bug fix: libc.fnmatch() takes NUL-terminated strings, so truncate it here
387	# to avoid disagreement about the length.
388	i = s.find('\0')
389	if i == -1:
390	n = len(s)
391	else:
392	s = s[:i]
393	n = i
394
395	if id_ == Id.VOp1_Pound: # shortest prefix
396	# 'abcd': match '', 'a', 'ab', 'abc', ...
397	i = 0
398	#log('# %r', s)
399	while True:
400	#log(' i %d', i)
401	assert i <= n
402	#log('Matching pattern %r with %r', arg, s[:i])
403	if libc.fnmatch(arg, s[:i]):
404	return s[i:]
405	if i >= n:
406	break
407	i = NextUtf8Char(s, i)
408	return s
409
410	elif id_ == Id.VOp1_DPound: # longest prefix
411	# 'abcd': match 'abc', 'ab', 'a'
412	i = n
413	while True:
414	assert i >= 0
415	#log('Matching pattern %r with %r', arg, s[:i])
416	if libc.fnmatch(arg, s[:i]):
417	return s[i:]
418	if i == 0:
419	break
420	i = PreviousUtf8Char(s, i)
421	return s
422
423	elif id_ == Id.VOp1_Percent: # shortest suffix
424	# 'abcd': match 'abcd', 'abc', 'ab', 'a'
425	i = n
426	while True:
427	assert i >= 0
428	#log('Matching pattern %r with %r', arg, s[:i])
429	if libc.fnmatch(arg, s[i:]):
430	return s[:i]
431	if i == 0:
432	break
433	i = PreviousUtf8Char(s, i)
434	return s
435
436	elif id_ == Id.VOp1_DPercent: # longest suffix
437	# 'abcd': match 'abc', 'bc', 'c', ...
438	i = 0
439	while True:
440	assert i <= n
441	#log('Matching pattern %r with %r', arg, s[:i])
442	if libc.fnmatch(arg, s[i:]):
443	return s[:i]
444	if i >= n:
445	break
446	i = NextUtf8Char(s, i)
447	return s
448
449	else:
450	raise NotImplementedError(ui.PrettyId(id_))
451
452
453	def _AllMatchPositions(s, regex):
454	# type: (str, str) -> List[Tuple[int, int]]
455	"""Returns a list of all (start, end) match positions of the regex against
456	s.
457
458	(If there are no matches, it returns the empty list.)
459	"""
460	matches = [] # type: List[Tuple[int, int]]
461	pos = 0
462	n = len(s)
463	while pos < n: # needed to prevent infinite loop in (.*) case
464	m = libc.regex_first_group_match(regex, s, pos)
465	if m is None:
466	break
467	matches.append(m)
468	start, end = m
469	pos = end # advance position
470	return matches
471
472
473	def _PatSubAll(s, regex, replace_str):
474	# type: (str, str, str) -> str
475	parts = [] # type: List[str]
476	prev_end = 0
477	for start, end in _AllMatchPositions(s, regex):
478	parts.append(s[prev_end:start])
479	parts.append(replace_str)
480	prev_end = end
481	parts.append(s[prev_end:])
482	return ''.join(parts)
483
484
485	class GlobReplacer(object):
486
487	def __init__(self, regex, replace_str, slash_tok):
488	# type: (str, str, Token) -> None
489
490	# TODO: It would be nice to cache the compilation of the regex here,
491	# instead of just the string. That would require more sophisticated use of
492	# the Python/C API in libc.c, which we might want to avoid.
493	self.regex = regex
494	self.replace_str = replace_str
495	self.slash_tok = slash_tok
496
497	def __repr__(self):
498	# type: () -> str
499	return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)
500
501	def Replace(self, s, op):
502	# type: (str, suffix_op.PatSub) -> str
503
504	regex = '(%s)' % self.regex # make it a group
505
506	if op.replace_mode == Id.Lit_Slash:
507	# Avoid infinite loop when replacing all copies of empty string
508	if len(self.regex) == 0:
509	return s
510
511	try:
512	# loop over matches
513	return _PatSubAll(s, regex, self.replace_str)
514	except RuntimeError as e:
515	# Not sure if this is possible since we convert from glob:
516	# libc.regex_first_group_match raises RuntimeError on regex syntax
517	# error.
518	msg = e.message # type: str
519	e_die('Error matching regex %r: %s' % (regex, msg),
520	self.slash_tok)
521
522	if op.replace_mode == Id.Lit_Pound:
523	regex = '^' + regex
524	elif op.replace_mode == Id.Lit_Percent:
525	regex = regex + '$'
526
527	m = libc.regex_first_group_match(regex, s, 0)
528	#log('regex = %r, s = %r, match = %r', regex, s, m)
529	if m is None:
530	return s
531	start, end = m
532	return s[:start] + self.replace_str + s[end:]
533
534
535	def ShellQuoteB(s):
536	# type: (str) -> str
537	"""Quote by adding backslashes.
538
539	Used for autocompletion, so it's friendlier for display on the
540	command line. We use the strategy above for other use cases.
541	"""
542	# There's no way to escape a newline! Bash prints ^J for some reason, but
543	# we're more explicit. This will happen if there's a newline on a file
544	# system or a completion plugin returns a newline.
545
546	# NOTE: tabs CAN be escaped with \.
547	s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')
548
549	# ~ for home dir
550	# ! for history
551	# * [] ? for glob
552	# {} for brace expansion
553	# space because it separates words
554	return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\\|;\'"<>?')