| 1 | #!/usr/bin/env python2
|
| 2 | from __future__ import print_function
|
| 3 |
|
| 4 | import unittest
|
| 5 |
|
| 6 | from data_lang import htm8
|
| 7 | from data_lang import htm8_util
|
| 8 | #from doctools.util import log
|
| 9 |
|
| 10 |
|
class LexerTest(unittest.TestCase):
    # Lexer-level checks driven by the INVALID_LEX and VALID_LEX tables
    # defined further down in this module.

    def testInvalid(self):
        # type: () -> None
        # Imported here rather than at module level, as in the original.
        from data_lang.htm8_test import ValidTokenList

        # Every entry must make ValidTokenList() raise htm8.LexError.
        for bad_html in INVALID_LEX:
            try:
                ValidTokenList(bad_html)
            except htm8.LexError as err:
                # Show the diagnostic in the test log, then move on.
                print(err)
                continue
            self.fail('Expected LexError %r' % bad_html)

    def testValid(self):
        # type: () -> None
        from data_lang.htm8_test import Lex

        # Only the input column is used here; the expected-XML column is
        # consumed by XmlTest.  The test passes if Lex() does not raise.
        for good_html, _ in VALID_LEX:
            Lex(good_html)
            print()
|
| 32 |
|
| 33 |
|
# Inputs for which the tests in this file expect htm8.LexError
# (LexerTest.testInvalid and ValidateTest.testInvalid both iterate over it).
INVALID_LEX = [
    '< >',
    '<a><',
    # NOTE(review): the next two entries are byte-identical, so the second
    # one adds no coverage.  Confirm whether one of them was meant to be a
    # different ampersand case.
    '&<',
    '&<',
    # Hm, is a bare > allowed?  Case left disabled.
    #'a > b',
    'a < b',
    '<!-- unfinished comment',
    '<? unfinished processing',
    '</div bad=attr> <a> <b>',

    # A bare < in text is not allowed, but 3 > 4 is allowed
    '<a> 3 < 4 </a>',
    # STYLEz is not a CDATA tag
    '<STYLEz><</STYLEz>',
]
|
| 51 |
|
# Markers for the "expected XML" column of the tables below.
# NOTE(review): SKIP is not referenced anywhere in the visible file; entries
# that should be skipped use '' instead (see XmlTest.testValid).
SKIP = 0
# The converted output is expected to equal the input.
UNCHANGED = 1

# (input, expected XML) pairs.  The input must lex without error; the second
# element is UNCHANGED, '' (skip the XML comparison), or the exact expected
# output.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', '<foo x="y"></foo>'),
    #('<foo x="&"></foo>', '<foo x="&"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    # NOTE(review): the expected text equals the input, which XmlTest treats
    # exactly like UNCHANGED.  Confirm whether an escaped form was intended.
    ('<p> x & y </p>', '<p> x & y </p>'),

    # No ambiguity: the / is separated from the closing >
    ('<img src=/ >', '<img src="/" >'),
    ('<img src="/">', UNCHANGED),
    ('<img src=foo/ >', '<img src="foo/" >'),
]
|
| 70 |
|
# Inputs for which ValidateTest.testInvalid expects htm8.ParseError when
# validating with BALANCED_TAGS.
INVALID_PARSE = [
    # closing tag name differs from the opening tag name
    '<a></b>',
    # missing closing tag
    '<a>',
    # this is a self-closing tag
    '<meta></meta>',
]
|
| 76 |
|
# Attribute-level inputs for which ValidateTest.testInvalid expects
# htm8.LexError.
INVALID_ATTR_LEX = [
    # Ambiguous: an unquoted value runs into the trailing slash.
    # It should be written with "" instead.
    '<img src=/>',
    '<img src= />',
    '<img src=foo/>',
    '<img src= foo/>',

    # Quote characters inside an unquoted value
    '<img src=x"y">',
    "<img src=j''>",
]
|
| 88 |
|
# (input, expected XML) pairs that must pass Validate() with BALANCED_TAGS.
# The second element is UNCHANGED, '' (skip the XML comparison in XmlTest),
# or the exact expected output.
VALID_PARSE = [
    ('<!DOCTYPE html>\n', ''),
    ('<!DOCTYPE>', ''),

    # Empty attribute values
    ('<p x=""></p>', UNCHANGED),
    ("<p x=''></p>", UNCHANGED),
    ('<self-closing a="b" />', UNCHANGED),

    # We could also normalize CDATA?
    # Note that CDATA has an escaping problem: a literal ]]> has to be
    # handled with concatenation.  It just "pushes the problem around".
    # So I think it's better to use ONE kind of escaping, which is <
    # NOTE(review): the trailing "<" above presumably named an entity escape
    # whose spelling was lost - confirm.
    ('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

    # A bare > in text is allowed, but 3 < 4 is not allowed
    # NOTE(review): the expected text equals the input, which XmlTest treats
    # exactly like UNCHANGED.  Confirm whether an escaped form was intended.
    ('<a> 3 > 4 </a>', '<a> 3 > 4 </a>'),
    # A bare < in an attribute value is allowed, but 3 > 4 is not allowed
    ('<p x="3 < 4"></p>', ''),
    ('<b><a href="foo">link</a></b>', UNCHANGED),

    # TODO: should be self-closing
    #('<meta><a></a>', '<meta/><a></a>'),
    ('<meta><a></a>', ''),

    # Attribute with no value
    ('<button disabled></button>', '<button disabled=""></button>'),
    ('<button disabled=></button>', '<button disabled=""></button>'),
    ('<button disabled= ></button>', '<button disabled= ""></button>'),

    # Single-quoted values are pretty common
    ("<a href='single'></a>", ''),

    # Conceding to reality - I used these myself
    ('<a href=ble.sh></a>', '<a href="ble.sh"></a>'),
    ('<a href=foo.html></a>', '<a href="foo.html"></a>'),
    ('<foo x="&"></foo>', ''),

    # Mixed-case tag names
    ('<foo></FOO>', ''),
    ('<Foo></fOO>', ''),

    # Capital VOID tag
    ('<META><a></a>', ''),
    ('<script><</script>', ''),
    # Opening and closing tags with matching case
    ('<SCRipt><</SCRipt>', ''),
    ('<SCRIPT><</SCRIPT>', ''),
    ('<STYLE><</STYLE>', ''),
    #'<SCRipt><</script>',

    # Regression test from blog
    ('<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>',
     ''),

    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
    # flag to handle this!  Gah I want something faster.
    #'<script><</SCRIPT>',

    # TODO: Test <svg> and <math> ?
]
|
| 150 |
|
# Inputs that ValidateTest.testValidXml validates with
# BALANCED_TAGS | NO_SPECIAL_TAGS; no exception is expected.
VALID_XML = [
    # The same string appears in INVALID_PARSE for the default flags.
    '<meta></meta>',
]
|
| 154 |
|
# Tag-level inputs for which ValidateTest.testInvalid expects htm8.LexError.
INVALID_TAG_LEX = [
    # Bad attribute
    '<a foo=bar !></a>',

    # BUG: are we "overshooting" here?  We don't have a sentinel.
    # I wonder if a one-pass lex is just simpler:
    # - It works with micro-syntax
    # - It doesn't have this problem, nor the stupid / problem
    # - You can add a sentinel, but then you potentially mess up COW of
    #   forked processes
    # - As long as you don't allocate, I think skipping the attributes is
    #   not going to be any faster
    # - We could also handle <a href=">"> then

    # A bare > in an attribute value is not allowed, but 3 < 4 is allowed
    '<p x="3 > 4"></p>',
    # Same, with single quotes
    "<p x='3 > 4'></p>",
    # Same thing
    '<a href=">"></a>',
]
|
| 176 |
|
| 177 |
|
class ValidateTest(unittest.TestCase):
    # Runs htm8_util.Validate() over the fixture tables defined in this
    # module and checks that each input is accepted or rejected as expected.

    def testInvalid(self):
        # type: () -> None
        counters = htm8_util.Counters()

        # All three lexing-level tables must be rejected with LexError.
        for s in INVALID_LEX + INVALID_TAG_LEX + INVALID_ATTR_LEX:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.LexError as e:
                print(e)
            else:
                self.fail('Expected LexError %r' % s)

        # These inputs must be rejected with ParseError instead.
        for s in INVALID_PARSE:
            try:
                htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
            except htm8.ParseError as e:
                print(e)
            else:
                # Name the offending input, as the LexError loop does, so a
                # failure can be traced to a specific table entry.
                self.fail('Expected ParseError %r' % s)

    def testValid(self):
        # type: () -> None
        counters = htm8_util.Counters()
        # Only the input column is used; the test passes if Validate() does
        # not raise for any entry.
        for s, _ in VALID_PARSE:
            print('HTML5 %r' % s)
            htm8_util.Validate(s, htm8_util.BALANCED_TAGS, counters)
        #print('HTML5 attrs %r' % counters.debug_attrs)

    def testValidXml(self):
        # type: () -> None
        counters = htm8_util.Counters()
        # Same check as testValid, with NO_SPECIAL_TAGS added to the flags.
        for s in VALID_XML:
            print('XML %r' % s)
            htm8_util.Validate(
                s, htm8_util.BALANCED_TAGS | htm8_util.NO_SPECIAL_TAGS,
                counters)
        #print('XML attrs %r' % counters.debug_attrs)
|
| 216 |
|
| 217 |
|
class XmlTest(unittest.TestCase):
    # Checks htm8_util.ToXml() against the expected-XML column of the
    # VALID_LEX and VALID_PARSE tables.

    def testValid(self):
        # type: () -> None
        for h, expected_xml in VALID_LEX + VALID_PARSE:
            # Convert every entry, including skipped ones, so that an
            # exception raised by ToXml() still fails the test.
            actual = htm8_util.ToXml(h)

            if expected_xml == '':  # Skip: no expected output recorded
                continue
            if expected_xml == UNCHANGED:  # Output must equal the input
                expected_xml = h

            # The message spells out input, expected and actual, so the
            # failing table entry is identifiable whether or not the runner
            # appends its own comparison text.
            self.assertEqual(
                expected_xml, actual,
                'ToXml(%r): expected %r, got %r' % (h, expected_xml, actual))
|
| 231 |
|
| 232 |
|
# Run every TestCase defined in this module when the file is executed
# directly (not when it is imported).
if __name__ == '__main__':
    unittest.main()
|