data_lang/htm8.py

OILS / data_lang / htm8.py View on Github | oils.pub

754 lines, 356 significant

1	"""data_lang/htm8.py
2
3	TODO
4
5	- would be nice: migrate everything off of TagLexer()
6	- oils_doc.py and help_gen.py
7	- this old API is stateful and uses Python iterators, which is problematic
8	- maybe we can use a better CSS selector abstraction
9
10	API:
11	- Get rid of Reset()?
12
13	Features:
14
15	- work on ToXml() test cases? This is another test of AttrLexer
16
17	Docs:
18
19	- Copy all errors into doc/ref/chap-errors.md
20	- This helps understand the language
21
22	C++:
23	- UTF-8 check, like JSON8
24	- re2c
25	- port lexer, which will fix static typing issues
26	- the abstraction needs to support submatch?
27	- for finding the end of a tag, etc.?
28	- and what about no match?
29
30	- harmonize LexError and ParseError with data_lang/j8.py, which uses
31	error.Decode(msg, ..., cur_line_num)
32	"""
33
34	import re
35
36	from typing import Dict, List, Tuple, Optional, IO, Any
37
38	from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, attr_name, attr_name_t,
39	attr_name_str, attr_value_e, attr_value_t,
40	h8_val_id)
41	from doctools.util import log
42
43
class LexError(Exception):
    """Raised when the input can't be split into tokens.

    Typical causes:

    - an h8_id.Invalid token, like <> or &&
    - an unclosed <!-- <? <![CDATA[ <script> or <style>
    """

    def __init__(self, msg, code_str, start_pos):
        # type: (str, str, int) -> None
        self.msg = msg
        # The text that was being lexed, and the offset that __str__ uses as
        # the start of its context snippet.
        self.code_str = code_str
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Show at most 20 characters of context, beginning at start_pos.
        begin = self.start_pos
        snippet = self.code_str[begin:begin + 20]
        return '(LexError %r %r)' % (self.msg, snippet)
62
63
def _FindLineNum(s, error_pos):
    # type: (str, int) -> int
    """Return the 1-based line number of offset error_pos within s.

    The result is 1 plus the number of newlines located strictly before
    error_pos, so a position that is itself a newline still counts as part
    of the line that the newline terminates.
    """
    line_num = 1
    newline_pos = s.find('\n')
    # Step from newline to newline until we run out of them, or reach one
    # that is at or after the error position.
    while newline_pos != -1 and newline_pos < error_pos:
        line_num += 1
        newline_pos = s.find('\n', newline_pos + 1)
    return line_num
78
79
class ParseError(Exception):
    """Raised when tokens were produced, but their structure is wrong.

    Examples of parse errors:

    - unbalanced tag structure
    - ul_table.py errors
    """

    def __init__(self, msg, s=None, start_pos=-1):
        # type: (str, Optional[str], int) -> None
        self.msg = msg
        # Optional location info: the text being parsed and an offset into it.
        self.s = s
        self.start_pos = start_pos

    def __str__(self):
        # type: () -> str
        # Without source text there is no location: line -1, empty snippet.
        line_num = -1
        snippet = ''

        if self.s is not None:
            # When the text is given, a real position must be given too.
            assert self.start_pos != -1, self.start_pos
            line_num = _FindLineNum(self.s, self.start_pos)
            # At most 20 characters of context, starting at the error.
            snippet = self.s[self.start_pos:self.start_pos + 20]

        return 'line %d: %r %r' % (line_num, self.msg, snippet)
106
107
class Output(object):
    """Writer for a sed-like "replacement" model.

    Wraps an input buffer `s` and an output file `f`, and keeps a cursor into
    the input.  Callers either copy spans of the input to the output
    (PrintUntil, PrintTheRest), skip over input (SkipTo), or emit brand new
    text (Print).
    """

    def __init__(self, s, f, left_pos=0, right_pos=-1):
        # type: (str, IO[str], int, int) -> None
        self.s = s
        self.f = f
        self.pos = left_pos
        # -1 is a sentinel meaning "up to the end of s"
        if right_pos == -1:
            right_pos = len(s)
        self.right_pos = right_pos

    def SkipTo(self, pos):
        # type: (int) -> None
        """Move the input cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        # type: (int) -> None
        """Copy input from the cursor up to pos, and leave the cursor at pos."""
        self.f.write(self.s[self.pos:pos])
        self.pos = pos

    def PrintTheRest(self):
        # type: () -> None
        """Copy input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        # type: (str) -> None
        """Write new text to the output; the input cursor is not moved."""
        self.f.write(s)
145
146
def MakeLexer(rules):
    """Compile a lexer table.

    Args:
      rules: list of (regex source, token id) pairs

    Returns:
      A list of (compiled pattern, token id) pairs, in the same order.  Each
      pattern is compiled with re.VERBOSE, so unescaped whitespace in the
      regex source (outside character classes) is not significant.
    """
    compiled = []
    for pat, tok_id in rules:
        compiled.append((re.compile(pat, re.VERBOSE), tok_id))
    return compiled
149
150
151	#
152	# Lexers
153	#
154
# Names must start with a letter; the rest may be letters, digits, ':', '_'
# or '-'.
_NAME_RE = r'[a-zA-Z][a-zA-Z0-9:_\-]*'

# Character references, plus the stray ampersand.  This table is a prefix of
# the other tables that can contain character data.  The patterns end up
# compiled with re.VERBOSE, so the spaces inside them are not significant,
# and a literal # has to be written as \#.
CHAR_LEX = [
    # Characters
    # https://www.w3.org/TR/xml/#sec-references
    (r'&\# [0-9]+ ;', h8_id.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', h8_id.HexChar),
    # TODO: shouldn't use _NAME_RE?  Just letters
    (r'& %s ;' % _NAME_RE, h8_id.CharEntity),
    # Allow unquoted, and quoted
    (r'&', h8_id.BadAmpersand),
]
167
# Top-level rules.  The FIRST rule that matches wins, so order matters.
HTM8_LEX = CHAR_LEX + [
    # TODO: CommentBegin, ProcessingBegin, CDataBegin could have an additional
    # action associated with them?  The ending substring
    (r'<!--', h8_id.CommentBegin),

    # Processing instructions are used for the XML header:
    #   <?xml version="1.0" encoding="UTF-8"?>
    # They are technically XML-only, but in HTML5, they are another kind of
    # comment:
    #
    #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
    #
    (r'<\?', h8_id.ProcessingBegin),

    # Not necessary in HTML5, but occurs in XML
    (r'<!\[CDATA\[', h8_id.CDataBegin),  # <![CDATA[

    # Markup declarations
    # - In HTML5, there is only <!DOCTYPE html>
    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
    #   - these seem to be part of DTD
    #   - it's useful to skip these, and be able to parse the rest of the
    #     document
    # - Note: < is allowed?
    (r'<! [^>\x00]+ >', h8_id.Decl),

    # Tags
    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    (r'</ (%s) >' % _NAME_RE, h8_id.EndTag),
    # self-closing <br/> comes before StartTag
    # could/should these be collapsed into one rule?
    (r'< (%s) [^>\x00]* />' % _NAME_RE, h8_id.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>\x00]* >' % _NAME_RE, h8_id.StartTag),  # start <a>

    # HTML5 allows unescaped > in raw data, but < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # - My early blog has THREE errors when disallowing >
    # - So do some .wwz files
    (r'[^&<>\x00]+', h8_id.RawData),
    (r'>', h8_id.BadGreaterThan),
    # NUL is the end, an accommodation for re2c.  Like we do in
    # frontend/match.
    (r'\x00', h8_id.EndOfStream),
    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

# Old notes:
#
# Non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
#

# This person tried to do it with a regex:
#
# https://skeptric.com/html-comment-regexp/index.html

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', h8_id.Comment),

# Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
#(r'<!-- [\s\S]*? -->', h8_id.Comment),
#(r'<!-- (?:.\|[\n])*? -->', h8_id.Comment),

HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
239
240
class Lexer(object):
    """Top-level HTM8 lexer over the span s[left_pos:right_pos].

    Read() returns one (token id, end position) pair at a time and advances
    self.pos.  After a StartTag, EndTag or StartEndTag, the position of the
    tag name is available through TagNamePos(), TagNameEquals() and
    CanonicalTagName().

    Nothing at or beyond right_pos is examined: regex matches, terminator
    searches and LookAhead() are all bounded by it.  Otherwise a token
    straddling right_pos would push self.pos past the end, and the next
    Read() would fail its bounds assertion instead of reporting a lex error.
    """

    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
        # type: (str, int, int, bool) -> None
        """
        Args:
          s: the text to lex
          left_pos: offset to start at
          right_pos: offset to stop at; -1 means len(s)
          no_special_tags: if True, the contents of <script> and <style> are
            lexed like any other markup, rather than skipped as HtmlCData
        """
        self.s = s
        self.pos = left_pos
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.no_special_tags = no_special_tags

        # string -> compiled regex pattern object
        self.cache = {}  # type: Dict[str, Any]

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

        # Position of tag name, if applicable
        # - Set after you get a StartTag, EndTag, or StartEndTag
        # - Unset on other tags
        self.tag_pos_left = -1
        self.tag_pos_right = -1

    def _FindOrFail(self, substr, err_msg):
        # type: (str, str) -> int
        """Find substr in s[self.pos:self.right_pos].

        Returns:
          The offset where substr begins.

        Raises:
          LexError(err_msg) positioned at self.pos, if substr doesn't occur
          entirely before right_pos.
        """
        found = self.s.find(substr, self.pos, self.right_pos)
        if found == -1:
            raise LexError(err_msg, self.s, self.pos)
        return found

    def _Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), WITHOUT advancing.

        Raises:
          LexError: for an unterminated comment, processing instruction,
            CDATA section, or <script> / <style> element.
        """
        if self.pos == self.right_pos:
            return h8_id.EndOfStream, self.pos

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None and not self.no_special_tags:
            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
            #
            # Another strategy: enter a mode where we find ONLY the end tag
            # regex, and any data that's not <, and then check the canonical
            # tag name for 'script' or 'style'.
            pos = self._FindOrFail(self.search_state,
                                   'Unterminated <script> or <style>')
            self.search_state = None
            # The token ends at the BEGINNING of the closing tag, which is
            # then lexed as a regular EndTag.
            return h8_id.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them.  This lexer should be expressible in re2c.

        for pat, tok_id in HTM8_LEX_COMPILED:
            # endpos=right_pos: match as if the string ended at right_pos
            m = pat.match(self.s, self.pos, self.right_pos)
            if m is None:
                continue

            if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                # Group 1 of the tag rules is the tag name
                self.tag_pos_left = m.start(1)
                self.tag_pos_right = m.end(1)
            else:
                # Reset state
                self.tag_pos_left = -1
                self.tag_pos_right = -1

            # The three "Begin" tokens are never returned.  We search for
            # the terminator and return the whole construct as one token.
            if tok_id == h8_id.CommentBegin:
                pos = self._FindOrFail('-->', 'Unterminated <!--')
                return h8_id.Comment, pos + 3  # -->

            if tok_id == h8_id.ProcessingBegin:
                pos = self._FindOrFail('?>', 'Unterminated <?')
                return h8_id.Processing, pos + 2  # ?>

            if tok_id == h8_id.CDataBegin:
                pos = self._FindOrFail(']]>', 'Unterminated <![CDATA[')
                return h8_id.CData, pos + 3  # ]]>

            if tok_id == h8_id.StartTag:
                # TODO: reduce allocations
                if (self.TagNameEquals('script') or
                        self.TagNameEquals('style')):
                    # Search for the closing tag with the SAME spelling:
                    # <SCRipt a=b> -> </SCRipt>
                    self.search_state = '</' + self._LiteralTagName() + '>'

            return tok_id, m.end()

        raise AssertionError('h8_id.Invalid rule should have matched')

    def TagNamePos(self):
        # type: () -> int
        """The right position of the tag name.

        Only valid after a StartTag, EndTag or StartEndTag.
        """
        assert self.tag_pos_right != -1, self.tag_pos_right
        return self.tag_pos_right

    def TagNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased tag name with expected.

        Only valid after a StartTag, EndTag or StartEndTag.
        """
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        # TODO: In C++, this does not need an allocation.  Can we test
        # directly?
        return expected == self.CanonicalTagName()

    def _LiteralTagName(self):
        # type: () -> str
        """The tag name exactly as spelled in the input."""
        assert self.tag_pos_left != -1, self.tag_pos_left
        assert self.tag_pos_right != -1, self.tag_pos_right

        return self.s[self.tag_pos_left:self.tag_pos_right]

    def CanonicalTagName(self):
        # type: () -> str
        """The tag name in lower case."""
        tag_name = self._LiteralTagName()
        # Most tags are already lower case, so avoid allocation with this
        # conditional
        # TODO: this could go in the mycpp runtime?
        if tag_name.islower():
            return tag_name
        else:
            return tag_name.lower()

    def Read(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position), and advance past it."""
        tok_id, end_pos = self._Read()
        self.pos = end_pos  # advance
        return tok_id, end_pos

    def LookAhead(self, regex):
        # type: (str) -> bool
        """Does regex match at the current position, before right_pos?

        Nothing is consumed.

        Currently used for ul_table.py.  But taking a dynamic regex string is
        not the right interface.
        """
        # Cache the regex compilation.  This could also be LookAheadFor(THEAD)
        # or something.
        pat = self.cache.get(regex)
        if pat is None:
            pat = re.compile(regex)
            self.cache[regex] = pat

        m = pat.match(self.s, self.pos, self.right_pos)
        return m is not None
382
383
# What comes after the tag name, or after the previous attribute?
A_NAME_LEX = [
    # Leading whitespace is required, to separate attributes.
    #
    # If the = is not present, then we set the lexer in a state for
    # attr_value_e.Missing.
    (r'\s+ (%s) \s* (=)? \s*' % _NAME_RE, attr_name.Ok),
    # unexpected EOF

    # The closing > or /> is treated as end of stream, and it's not an error.
    (r'\s* /? >', attr_name.Done),

    # NUL should not be possible, because the top-level tag rules only accept
    # [^>\x00]* between the tag name and the closing >

    # This includes < - it is not BadLessThan because it's NOT recoverable
    (r'.', attr_name.Invalid),
]

A_NAME_LEX_COMPILED = MakeLexer(A_NAME_LEX)
402
403	# Here we just loop on regular tokens
404	#
405	# Examples:
406	# <a href = unquoted&foo >
407	# <a href = unquoted&foo > # BadAmpersand is allowed I guess
408	# <a href ="unquoted&foo" > # double quoted
409	# <a href ='unquoted&foo' > # single quoted
410	# <a href = what"foo" > # HTML5 allows this, but we could disallow it if
411	# it's not common. It opens up the j"" and $"" extensions
412	# <a href = what'foo' > # ditto
413
# An unquoted value runs until whitespace, an angle bracket, an ampersand, a
# quote, or NUL.
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]+'''

# What comes after = ?
A_VALUE_LEX = [
    (r'"', h8_val_id.DoubleQuote),
    (r"'", h8_val_id.SingleQuote),
    (_UNQUOTED_VALUE, h8_val_id.UnquotedVal),
    # Anything else means there is no value to read, e.g. <a foo = >
    (r'.', h8_val_id.NoMatch),
]

A_VALUE_LEX_COMPILED = MakeLexer(A_VALUE_LEX)
425
426	# What's inside "" or '' ?
QUOTED_VALUE_LEX = CHAR_LEX + [
    # Both quote characters are tokens here; the caller decides which one
    # closes the value.
    (r'"', h8_id.DoubleQuote),
    (r"'", h8_id.SingleQuote),
    (r'<', h8_id.BadLessThan),  # BadAmpersand is in CharLex

    # TODO: think about whitespace for efficient class= queries?
    #(r'[ \r\n\t]', h8_id.Whitespace),  # terminates unquoted values
    (r'''[^"'<>&\x00]+''', h8_id.RawData),
    # This includes > - it is not BadGreaterThan because it's NOT recoverable
    (r'.', h8_id.Invalid),
]

QUOTED_VALUE_LEX_COMPILED = MakeLexer(QUOTED_VALUE_LEX)
440
441
class AttrLexer(object):
    """Lexes the attributes of one StartTag or StartEndTag token.

    Typical usage:

        attr_lx.Init(tok_id, tag_name_pos, end_pos)
        while True:
            n, name_start, name_end, equal_end = attr_lx.ReadName()
            if n == attr_name.Ok:
                if attr_lx.AttrNameEquals('div'):
                    print('div')

                # TODO: also pass Optional[List[]] out_tokens?
                v, start_pos, end_pos = attr_lx.ReadValue()
            else:
                break  # attr_name.Done or attr_name.Invalid

    The API is stateful: every ReadName() that returns attr_name.Ok must be
    followed by ReadValue() before the next ReadName().
    """

    def __init__(self, s):
        # type: (str) -> None
        self.s = s

        self.tok_id = h8_id.Invalid  # Uninitialized
        self.tag_name_pos = -1  # Invalid
        self.tag_end_pos = -1  # kept for compatibility; not read in this class
        # Set by Init().  Defined here too, so that every attribute that the
        # methods read exists from construction.
        self.end_pos = -1
        self.must_not_exceed_pos = -1

        self.pos = -1

        # State set by ReadName(), for AttrNameEquals() and ReadValue()
        self.name_start = -1
        self.name_end = -1
        self.equal_end = -1
        self.next_value_is_missing = False

        # Saved Init() arguments, for Reset()
        self.init_t = -1
        self.init_e = -1

    def Init(self, tok_id, tag_name_pos, end_pos):
        # type: (h8_id_t, int, int) -> None
        """Initialize so we can read names and values.

        Args:
          tok_id: h8_id.StartTag or h8_id.StartEndTag
          tag_name_pos: position where the tag name ends; reading starts here
          end_pos: end of the tag token.  The arithmetic below treats it as
            the position just past the closing >

        Example:
          'x <a y>'  # tag_name_pos=4, end_pos=7
          'x <a>'    # tag_name_pos=4, end_pos=5

        The Init() method is used to reuse instances of the AttrLexer object.
        """
        assert tag_name_pos >= 0, tag_name_pos
        assert end_pos >= 0, end_pos

        #log('TAG NAME POS %d', tag_name_pos)

        self.tok_id = tok_id
        self.tag_name_pos = tag_name_pos
        self.end_pos = end_pos

        # Check for ambiguous <img src=/>
        # An unquoted value must end before the tag's closing delimiter.
        if tok_id == h8_id.StartTag:
            self.must_not_exceed_pos = end_pos - 1  # account for >
        elif tok_id == h8_id.StartEndTag:
            self.must_not_exceed_pos = end_pos - 2  # account for />
        else:
            raise AssertionError(tok_id)

        self.pos = tag_name_pos

        # For Reset()
        self.init_t = tag_name_pos
        self.init_e = end_pos

    def Reset(self):
        # type: () -> None
        """Rewind to the position given to the last Init()."""

        # TODO: maybe GetAttrRaw() should call this directly?  But not any of
        # the AllAttrs() methods?
        self.tag_name_pos = self.init_t
        self.end_pos = self.init_e
        self.pos = self.init_t

    def ReadName(self):
        # type: () -> Tuple[attr_name_t, int, int, int]
        """Reads the attribute name

        Returns:
          (attr_name.Ok, name_start, name_end, equal_end) for an attribute,
          or (attr_name.Done, -1, -1, -1) at the closing > or />,
          or (attr_name.Invalid, -1, -1, -1) for anything else.  The position
          is not advanced in the Invalid case.

        EOF case:
          <a>
          <a >

        Error case:
          <a !>
          <a foo=bar !>
        """
        for pat, a in A_NAME_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            #log('ReadName() matching %r at %d', self.s, self.pos)
            if m is None:
                continue

            #log('ReadName() tag_name_pos %d pos, %d %s', self.tag_name_pos, self.pos, m.groups())
            if a == attr_name.Invalid:
                #log('m.groups %s', m.groups())
                return attr_name.Invalid, -1, -1, -1

            self.pos = m.end(0)  # Advance if it's not invalid

            if a == attr_name.Ok:
                #log('%r', m.groups())
                self.name_start = m.start(1)
                self.name_end = m.end(1)
                self.equal_end = m.end(0)  # XML conversion needs this
                # Is the equals sign missing?  Set state.
                if m.group(2) is None:
                    self.next_value_is_missing = True
                    # HACK: REWIND, since we don't want to consume whitespace
                    self.pos = self.name_end
                else:
                    self.next_value_is_missing = False
                return (attr_name.Ok, self.name_start, self.name_end,
                        self.equal_end)

            # Reset state - e.g. you must call AttrNameEquals
            self.name_start = -1
            self.name_end = -1

            if a == attr_name.Done:
                return attr_name.Done, -1, -1, -1

        # No rule matched.  This is reachable: the catch-all '.' rule doesn't
        # match a newline, so e.g. <a\n!> ends up here.  It's bad input, not
        # an internal error, so report it the same way as the Invalid rule.
        return attr_name.Invalid, -1, -1, -1

    def _CanonicalAttrName(self):
        # type: () -> str
        """Return the lower case attribute name.

        Must call after ReadName()
        """
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        attr_name = self.s[self.name_start:self.name_end]
        # Avoid an allocation when the name is already lower case
        if attr_name.islower():
            return attr_name
        else:
            return attr_name.lower()

    def AttrNameEquals(self, expected):
        # type: (str) -> bool
        """Compare the lower-cased attribute name with expected.

        Must call after ReadName()

        TODO: This can be optimized to be "in place", with zero allocs.
        """
        return expected == self._CanonicalAttrName()

    def _QuotedRead(self):
        # type: () -> Tuple[h8_id_t, int]
        """Return the next (token id, end position) inside a quoted value.

        Does NOT advance self.pos; the caller does that.
        """
        for pat, tok_id in QUOTED_VALUE_LEX_COMPILED:
            # BUG: We can OVER-READ what the segment lexer gave us, e.g. with
            # <a href=">"> - the inside > ends it
            m = pat.match(self.s, self.pos)
            if m:
                end_pos = m.end(0)  # Advance
                #log('_QuotedRead %r', self.s[self.pos:end_pos])
                return tok_id, end_pos
        else:
            context = self.s[self.pos:self.pos + 10]
            raise AssertionError('h8_id.Invalid rule should have matched %r' %
                                 context)

    def _ReadUntilQuote(self, closing_id, err_msg):
        # type: (h8_id_t, str) -> int
        """Consume quoted-value tokens up to and including the closing quote.

        Args:
          closing_id: h8_id.DoubleQuote or h8_id.SingleQuote.  The other kind
            of quote is ordinary content.
          err_msg: message for the LexError raised on an Invalid token

        Returns:
          The position of the closing quote, i.e. the end of the value.
          self.pos is left just past the closing quote.
        """
        while True:
            tok_id, q_end_pos = self._QuotedRead()
            #log('self.pos %d q_end_pos %d', self.pos, q_end_pos)
            if tok_id == h8_id.Invalid:
                raise LexError(err_msg, self.s, self.pos)
            if tok_id == closing_id:
                right_pos = self.pos
                self.pos = q_end_pos  # Advance past the closing quote
                return right_pos
            self.pos = q_end_pos  # Advance _QuotedRead

    def ReadValue(self, tokens_out=None):
        # type: (Optional[List[Tuple[h8_id_t, int]]]) -> Tuple[attr_value_t, int, int]
        """Read the attribute value.

        In general, it is escaped or "raw"

        Can only be called after a SUCCESSFUL ReadName().
        Assuming ReadName() returned a value, this should NOT fail.

        Args:
          tokens_out: not used yet (see the TODOs below)

        Returns:
          (attr_value_e variant, value start, value end).  The positions
          exclude the quotes, and are -1 for Missing and Empty.

        Raises:
          LexError: for an unquoted value that swallows the / of />, or an
            invalid token inside a quoted value.
        """
        # ReadName() invariant
        assert self.name_start >= 0, self.name_start
        assert self.name_end >= 0, self.name_end

        # Consume the name, so this can't be called twice for one ReadName()
        self.name_start = -1
        self.name_end = -1

        if self.next_value_is_missing:
            # Do not advance self.pos
            #log('-> MISSING pos %d : %r', self.pos, self.s[self.pos:])
            return attr_value_e.Missing, -1, -1

        # Now read " ', unquoted or empty= is valid too.
        for pat, a in A_VALUE_LEX_COMPILED:
            m = pat.match(self.s, self.pos)
            if m is None:
                continue

            first_end_pos = m.end(0)
            # We shouldn't go past the end
            assert first_end_pos <= self.end_pos, \
                'first_end_pos = %d should be less than self.end_pos = %d' % (first_end_pos, self.end_pos)
            #log('m %s', m.groups())

            # Note: Unquoted value can't contain & etc. now, so there
            # is no unquoting, and no respecting tokens_raw.
            if a == h8_val_id.UnquotedVal:
                if first_end_pos > self.must_not_exceed_pos:
                    #log('first_end_pos %d', first_end_pos)
                    #log('must_not_exceed_pos %d', self.must_not_exceed_pos)
                    raise LexError(
                        'Ambiguous slash: last attribute should be quoted',
                        self.s, first_end_pos)
                self.pos = first_end_pos  # Advance
                return attr_value_e.Unquoted, m.start(0), first_end_pos

            # TODO: respect tokens_out
            if a == h8_val_id.DoubleQuote:
                self.pos = first_end_pos  # Advance past the opening "
                right_pos = self._ReadUntilQuote(
                    h8_id.DoubleQuote, 'ReadValue() got invalid token (DQ)')
                return attr_value_e.DoubleQuoted, first_end_pos, right_pos

            # TODO: respect tokens_out
            if a == h8_val_id.SingleQuote:
                self.pos = first_end_pos  # Advance past the opening '
                right_pos = self._ReadUntilQuote(
                    h8_id.SingleQuote, 'ReadValue() got invalid token (SQ)')
                return attr_value_e.SingleQuoted, first_end_pos, right_pos

            if a == h8_val_id.NoMatch:
                # <a foo = >
                return attr_value_e.Empty, -1, -1

        raise AssertionError('h8_val_id.NoMatch rule should have matched')
685
686
def GetAttrRaw(attr_lx, name):
    # type: (AttrLexer, str) -> Optional[str]
    """Return the raw value of the first attribute called `name`.

    Reads attributes from the lexer's current position.  The name is
    compared after lower-casing the attribute name.

    Returns:
      The slice of the input holding the value, without quotes and without
      any unescaping.  It's '' when the attribute has a missing or empty
      value.  None if the closing > is reached without finding `name`.

    Raises:
      LexError: if something other than an attribute or the end of the tag
        is found.
    """
    while True:
        n, _, _, _ = attr_lx.ReadName()
        #log('==> ReadName %s', attr_name_str(n))

        if n == attr_name.Done:
            return None
        if n == attr_name.Invalid:
            raise LexError('GetAttrRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        if n != attr_name.Ok:
            raise AssertionError()

        # Compare BEFORE ReadValue(), which clears the name state.
        found = attr_lx.AttrNameEquals(name)

        # Problem with stateful API: You are forced to either ReadValue()
        # or SkipValue()
        _, val_start, val_end = attr_lx.ReadValue()
        if found:
            return attr_lx.s[val_start:val_end]
709
710
def AllAttrsRawSlice(attr_lx):
    # type: (AttrLexer) -> List[Tuple[int, int, int, attr_value_t, int, int]]
    """Read every remaining attribute, as positions into attr_lx.s.

    Returns:
      A list with one tuple per attribute:

        (name_start, name_end, equal_end, value kind, val_start, val_end)

    Raises:
      LexError: if something other than an attribute or the end of the tag
        is found.
    """
    slices = []
    while True:
        n, name_start, name_end, equal_end = attr_lx.ReadName()

        if n == attr_name.Done:
            return slices
        if n == attr_name.Invalid:
            raise LexError('AllAttrsRaw() got invalid token', attr_lx.s,
                           attr_lx.pos)
        if n != attr_name.Ok:
            raise AssertionError()

        #name = attr_lx.s[name_start:name_end]
        #log(' Name %r', name)

        v, val_start, val_end = attr_lx.ReadValue()
        #val = attr_lx.s[val_start:val_end]
        #log(' ReadValue %r', val)
        slices.append((name_start, name_end, equal_end, v, val_start, val_end))
737
738
def AllAttrsRaw(attr_lx):
    # type: (AttrLexer) -> List[Tuple[str, str]]
    """
    Get a list of pairs [('class', 'foo'), ('href', '?foo=1&bar=2')]

    Names are returned as spelled in the input.  The quoted values may be
    escaped.  We would need another lexer to unescape them.
    """
    slices = AllAttrsRawSlice(attr_lx)
    s = attr_lx.s
    # Only the name and value positions are needed here.
    return [(s[name_start:name_end], s[val_start:val_end])
            for name_start, name_end, _, _, val_start, val_end in slices]