data_lang/htm8_util

OILS / data_lang / htm8_util_test.py View on Github | oils.pub

234 lines, 134 significant

1	#!/usr/bin/env python2
2	from __future__ import print_function
3
4	import unittest
5
6	from data_lang import htm8
7	from data_lang import htm8_util
8	#from doctools.util import log
9
10
class LexerTest(unittest.TestCase):
    """Runs the lexing helpers from data_lang.htm8_test over the shared tables."""

    def testInvalid(self):
        # type: () -> None
        """Each INVALID_LEX string must make ValidTokenList raise LexError."""
        # Function-scope import, kept local to this test as in the original.
        from data_lang.htm8_test import ValidTokenList

        for bad_html in INVALID_LEX:
            try:
                ValidTokenList(bad_html)
            except htm8.LexError as err:
                # Expected outcome: show the error and move on.
                print(err)
                continue
            # Reached only when no LexError was raised for this input.
            self.fail('Expected LexError %r' % bad_html)

    def testValid(self):
        # type: () -> None
        """Each VALID_LEX input must pass through Lex without raising.

        Only the HTML half of each pair is used; the expected-XML half is
        ignored by this test.
        """
        from data_lang.htm8_test import Lex

        for html, _expected_xml in VALID_LEX:
            Lex(html)
            print()
32
33
# Inputs for which the testInvalid methods expect htm8.LexError.
INVALID_LEX = [
    '< >',
    '<a><',
    '&amp<',
    '&<',
    # Open question from the author: is a bare > allowed?  Entry disabled.
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # A bare < in text is rejected, whereas 3 > 4 is allowed.
    '<a> 3 < 4 </a>',
    # STYLEz is not a CDATA tag.
    '<STYLEz><</STYLEz>',
]
51
# Sentinels for the expected-XML slot of the (html, expected_xml) pairs.
# UNCHANGED: the converted output should equal the input.
# SKIP: not referenced by the visible tests, which treat '' as "skip" instead.
SKIP, UNCHANGED = 0, 1
54
# (html, expected_xml) pairs that must lex cleanly.  expected_xml is either a
# literal string, UNCHANGED (output equals input), or '' (comparison skipped).
#
# NOTE(review): a few entries below have an expected string identical to the
# input without being marked UNCHANGED.  This copy may have had HTML entities
# (e.g. &amp;) decoded somewhere along the way -- confirm against upstream.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', '<foo x="y"></foo>'),
    # Disabled variant with an explicit expectation; the live entry skips it.
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x & y </p>'),

    # No ambiguity
    ('<img src=/ >', '<img src="/" >'),
    ('<img src="/">', UNCHANGED),
    ('<img src=foo/ >', '<img src="foo/" >'),
]
70
# Inputs for which ValidateTest.testInvalid expects htm8.ParseError.
INVALID_PARSE = [
    '<a></b>',
    '<a>',  # missing closing tag
    '<meta></meta>',  # this is a self-closing tag
]
76
# Attribute-value inputs for which ValidateTest.testInvalid expects
# htm8.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous, should be ""
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quote characters inside an unquoted value
    '<img src=x"y">',
    "<img src=j''>",
]
88
# (html, expected_xml) pairs that must validate cleanly.  expected_xml is
# either a literal string, UNCHANGED (output equals input), or '' (comparison
# skipped).
#
# NOTE(review): some expected strings are identical to their input without
# being marked UNCHANGED (e.g. the '3 > 4' entry).  This copy may have had
# HTML entities decoded somewhere along the way -- confirm against upstream.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # empty strings
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: ]]> has to be handled with
    # concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is <
    # (NOTE(review): presumably the entity &lt; was meant here -- confirm.)
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # allowed, but 3 < 4 is not allowed
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # no attribute
    ('<button disabled></button>', '<button disabled=""></button>'),
    ('<button disabled=></button>', '<button disabled=""></button>'),
    ('<button disabled= ></button>', '<button disabled= ""></button>'),

    # single quoted is pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', '<a href="ble.sh"></a>'),
    ('<a href=foo.html></a>', '<a href="foo.html"></a>'),
    ('<foo x="&"></foo>', ''),

    # caps
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # matching
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
150
# Inputs that ValidateTest.testValidXml validates with NO_SPECIAL_TAGS added
# to the flags.
VALID_XML = [
    '<meta></meta>',
]
154
# Tag-level inputs for which ValidateTest.testInvalid expects htm8.LexError.
INVALID_TAG_LEX = [
    # bad attr
    '<a foo=bar !></a>',

    # BUG: are we "overshooting" here?  We don't have a sentinel
    # I wonder if a one-pass lex is just simpler:
    # - It works with micro-syntax
    # - And it doesn't have this problem, as well as the stupid / problem
    # - You can add a sentinel, but then you mess up COW of forked processes,
    #   potentially
    # - As long as you don't allocate, I think it's not going to be any faster
    #   to skip the attributes
    # - We could also handle <a href=">"> then

    # Not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # with single quotes
    "<p x='3 > 4'></p>",
    # Same thing
    '<a href=">"></a>',
]
176
177
class ValidateTest(unittest.TestCase):
    """Runs htm8_util.Validate() over the valid and invalid input tables."""

    def testInvalid(self):
        # type: () -> None
        """Invalid inputs must raise the error type their table implies.

        The three *_LEX tables must raise htm8.LexError; INVALID_PARSE must
        raise htm8.ParseError.  One Counters object is shared by all calls.
        """
        counters = htm8_util.Counters()
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        for s in INVALID_PARSE:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError branch does.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        """Every VALID_PARSE input must validate without raising.

        The expected-XML half of each pair is ignored here.
        """
        counters = htm8_util.Counters()
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        """Every VALID_XML input must validate with NO_SPECIAL_TAGS also set."""
        counters = htm8_util.Counters()
        # Flags are combined with bitwise-or; computed once for the whole loop.
        flags = htm8_util.BALANCED_TAGS | htm8_util.NO_SPECIAL_TAGS
        for s in VALID_XML:
            print('XML %r' % s)
            htm8_util.Validate(s, flags, counters)
            #print('XML attrs %r' % counters.debug_attrs)
216
217
class XmlTest(unittest.TestCase):
    """Checks htm8_util.ToXml() against the expected-XML half of the tables."""

    def testValid(self):
        # type: () -> None
        """Convert every VALID_LEX and VALID_PARSE input and compare the result.

        The expected value of each pair selects the check:
          - ''        : no comparison (ToXml is still called, so it must
                        not raise)
          - UNCHANGED : output must equal the input
          - otherwise : output must equal the expected string
        """
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            actual = htm8_util.ToXml(h)
            if expected_xml == '':  # Skip
                continue
            if expected_xml == UNCHANGED:  # Unchanged
                self.assertEqual(h, actual)
            else:
                self.assertEqual(expected_xml, actual)
231
232
if __name__ == '__main__':
    # Executed as a script: hand control to the unittest runner.
    unittest.main()