| 1 | #!/usr/bin/env python2
|
| 2 |
|
| 3 | try:
|
| 4 | from cStringIO import StringIO
|
| 5 | except ImportError:
|
| 6 | # for python3
|
| 7 | from io import StringIO # type: ignore
|
| 8 | import sys
|
| 9 |
|
| 10 | from typing import List
|
| 11 |
|
| 12 | from _devbuild.gen.htm8_asdl import (h8_id, h8_id_str, attr_value_e)
|
| 13 | from data_lang import htm8
|
| 14 | from data_lang.htm8 import (Lexer, LexError, ParseError, Output)
|
| 15 | from doctools.util import log
|
| 16 |
|
# Tag names that Validate() treats as self-closing unless NO_SPECIAL_TAGS is
# set, e.g. <meta> is handled like <meta/>.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
]

# Bit flags for the `flags` argument of Validate(); combine them with |.
NO_LEX_ATTRS = 1 << 1  # skip href="?x=42&y=99"
NO_SPECIAL_TAGS = 1 << 2  # <script> <style>, VOID tags, etc.
BALANCED_TAGS = 1 << 3  # are tags balanced?
|
| 38 |
|
| 39 |
|
def Validate(contents, flags, counters):
    # type: (str, int, Counters) -> None
    """Lex `contents` as HTM8 and accumulate statistics into `counters`.

    Args:
      contents: the document text to lex.
      flags: bitwise OR of the module-level flags:
        NO_LEX_ATTRS     don't lex (or count) the attributes of each tag
        NO_SPECIAL_TAGS  passed to the lexer as no_special_tags; together
                         with BALANCED_TAGS it also means that tags listed
                         in VOID_ELEMENTS must be closed like any other tag
        BALANCED_TAGS    check that each end tag matches the most recently
                         opened start tag, and that no tag is left open
      counters: a Counters instance, mutated in place.  The tag and
        attribute counts are updated as the document is read; num_tokens is
        only updated if the whole document validates.

    Raises:
      LexError: the lexer returned an Invalid token.
      ParseError: BALANCED_TAGS is set and the tags don't balance.
    """
    lex_attrs = not (flags & NO_LEX_ATTRS)
    check_balance = bool(flags & BALANCED_TAGS)
    no_special_tags = bool(flags & NO_SPECIAL_TAGS)

    attr_lx = htm8.AttrLexer(contents)
    lx = htm8.Lexer(contents, no_special_tags=no_special_tags)

    # Only the NUMBER of tokens is reported, so keep a count instead of
    # accumulating every (tok_id, end_pos) pair in a list.
    num_tokens = 0
    start_pos = 0  # where the current token begins
    tag_stack = []  # names of the tags that are currently open
    while True:
        tok_id, end_pos = lx.Read()
        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])

        if tok_id == h8_id.Invalid:
            raise LexError('Validate() got invalid token', contents, start_pos)
        if tok_id == h8_id.EndOfStream:
            break

        num_tokens += 1

        is_start_tag = (tok_id == h8_id.StartTag)
        if is_start_tag or tok_id == h8_id.StartEndTag:
            if is_start_tag:
                counters.num_start_tags += 1
            else:
                counters.num_start_end_tags += 1

            attr_lx.Init(tok_id, lx.TagNamePos(), end_pos)
            if lex_attrs:
                all_attrs = htm8.AllAttrsRaw(attr_lx)
                counters.num_attrs += len(all_attrs)
                # TODO: val_lexer.NumTokens() can be replaced with tokens_out

                #counters.debug_attrs.extend(all_attrs)

            # Only <a> opens an element; <a/> does not touch the stack
            if is_start_tag:
                if check_balance:
                    tag_name = lx.CanonicalTagName()
                    # e.g. <meta> is considered self-closing, like <meta/>,
                    # unless special tags are turned off
                    if no_special_tags or tag_name not in VOID_ELEMENTS:
                        tag_stack.append(tag_name)

                counters.max_tag_stack = max(counters.max_tag_stack,
                                             len(tag_stack))

        elif tok_id == h8_id.EndTag:
            if check_balance:
                if not tag_stack:
                    raise ParseError('Tag stack empty',
                                     s=contents,
                                     start_pos=start_pos)
                expected = tag_stack.pop()

                actual = lx.CanonicalTagName()
                if expected != actual:
                    raise ParseError(
                        'Got unexpected closing tag %r; opening tag was %r' %
                        (contents[start_pos:end_pos], expected),
                        s=contents,
                        start_pos=start_pos)

        start_pos = end_pos

    # The stack is only ever pushed to when BALANCED_TAGS is set
    if tag_stack:
        raise ParseError('Missing closing tags at end of doc: %s' %
                         ' '.join(tag_stack),
                         s=contents,
                         start_pos=start_pos)

    counters.num_tokens += num_tokens
|
| 117 |
|
| 118 |
|
def ToXml(htm8_str):
    # type: (str) -> str
    """Rewrite an HTM8 string toward well-formed XML (work in progress).

    What is done so far:
      - attributes with a missing, empty, or unquoted value get a
        double-quoted value: <a x> -> <a x="">, <a x=> -> <a x="">,
        <a x=y> -> <a x="y">
      - a stray & or > in the text (BadAmpersand / BadGreaterThan tokens)
        is replaced with &amp; or &gt;
      - everything else is copied through unchanged

    Args:
      htm8_str: the HTM8 document text.

    Returns:
      The converted text.

    Raises:
      LexError: the lexer returned an Invalid token.
    """
    # TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    attr_lexer = htm8.AttrLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    pos = 0  # start of the current token, i.e. end of the previous one
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == h8_id.Invalid:
            raise LexError('ToXml() got invalid token', htm8_str, pos)
        if tok_id == h8_id.EndOfStream:
            break

        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
                      h8_id.DecChar):
            out.PrintUntil(end_pos)

        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
            attr_lexer.Init(tok_id, lx.TagNamePos(), end_pos)
            all_attrs = htm8.AllAttrsRawSlice(attr_lexer)
            for name_start, name_end, equal_end, v, val_start, val_end in all_attrs:
                if v == attr_value_e.Missing:  # <a missing>
                    out.PrintUntil(name_end)
                    out.Print('=""')
                elif v == attr_value_e.Empty:  # <a empty=>
                    out.PrintUntil(equal_end)
                    out.Print('""')
                elif v == attr_value_e.Unquoted:  # <a foo=bar>
                    # Because we disallow ", we can just surround with quotes
                    out.PrintUntil(val_start)
                    out.Print('"')
                    out.PrintUntil(val_end)
                    out.Print('"')

                #val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double quotes
                #           because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

            # The rest of the tag is not printed here; it is copied by the
            # next PrintUntil() / PrintTheRest() call.

        elif tok_id == h8_id.BadAmpersand:
            # Copy any source text that is still pending before this token
            # (e.g. the tail of a preceding tag), so the SkipTo() below only
            # drops the bad character itself.
            # NOTE(review): assumes PrintUntil() copies source text up to the
            # given position and SkipTo() moves the cursor without printing
            # -- confirm against htm8.Output.
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        elif tok_id == h8_id.BadGreaterThan:
            # Same as BadAmpersand: flush, emit the entity, skip the raw char
            out.PrintUntil(pos)
            out.Print('&gt;')
            out.SkipTo(end_pos)

        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()
|
| 199 |
|
| 200 |
|
class Counters(object):
    """Running totals that Validate() adds to and main() reports.

    One instance can be passed to several Validate() calls, in which case the
    totals cover all of the documents.
    """

    def __init__(self):
        # type: () -> None

        # Tags seen: <a> and <a/> are counted separately
        self.num_start_tags = 0
        self.num_start_end_tags = 0

        # Attributes found on those tags (stays 0 under NO_LEX_ATTRS)
        self.num_attrs = 0

        # Deepest nesting of open tags observed
        self.max_tag_stack = 0

        # Tokens in the documents that validated without error
        self.num_tokens = 0

        # Reported by main(); nothing in this file increments it
        self.num_val_tokens = 0

        # Debugging aid, currently disabled:
        #self.debug_attrs = []
|
| 213 |
|
| 214 |
|
def main(argv):
    # type: (List[str]) -> int
    """Command-line entry point; argv[1] selects the action.

    Actions:
      tokens      Lex the text on stdin and log each token: its end
                  position, token id, and source text.
      quick-scan  Validate files, without lexing attributes.
      lex-htm8    Validate files, lexing attributes too.
      parse-htm8  Like lex-htm8, and also check that tags are balanced.
      parse-xml   Like parse-htm8, with special tag handling turned off.
      todo        Placeholder; does nothing.

    For the four validation actions, stdin lists the files to check, one
    name per line.  Lex and parse errors are logged and counted rather than
    aborting the run, and a summary of the counters is logged at the end.

    Args:
      argv: the full argument list, including the program name.

    Returns:
      0 on success; 1 if any file had a lex or parse error.

    Raises:
      RuntimeError: the action is missing or not recognized.
      LexError: the 'tokens' action hit an invalid token.
    """
    if len(argv) < 2:
        # Give a readable error rather than an IndexError from argv[1]
        raise RuntimeError('Expected an action argument')
    action = argv[1]

    if action == 'tokens':
        contents = sys.stdin.read()

        lx = Lexer(contents)
        start_pos = 0
        while True:
            tok_id, end_pos = lx.Read()
            if tok_id == h8_id.Invalid:
                raise LexError('Invalid token', contents, start_pos)
            if tok_id == h8_id.EndOfStream:
                break

            frag = contents[start_pos:end_pos]
            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
            start_pos = end_pos

        return 0

    elif action in ('quick-scan', 'lex-htm8', 'parse-htm8', 'parse-xml'):

        errors = []
        counters = Counters()

        flags = 0
        if action == 'quick-scan':
            flags |= NO_LEX_ATTRS
        if action.startswith('parse-'):
            flags |= BALANCED_TAGS
        if action == 'parse-xml':
            flags |= NO_SPECIAL_TAGS

        for line in sys.stdin:
            filename = line.strip()
            if not filename:
                # A blank line is not a file name; don't try to open('')
                continue
            with open(filename) as f:
                contents = f.read()

            try:
                Validate(contents, flags, counters)
            except LexError as e:
                log('Lex error in %r: %s', filename, e)
                errors.append((filename, e))
            except ParseError as e:
                log('Parse error in %r: %s', filename, e)
                errors.append((filename, e))

        log('')
        log('%10d tokens', counters.num_tokens)
        log('%10d start/end tags', counters.num_start_end_tags)
        log('%10d start tags', counters.num_start_tags)
        log('%10d attrs', counters.num_attrs)
        log('%10d max tag stack depth', counters.max_tag_stack)
        log('%10d attr val tokens', counters.num_val_tokens)
        log('%10d errors', len(errors))
        return 1 if errors else 0

    elif action == 'todo':
        # Other algorithms:
        #
        # - select first subtree with given ID
        #   - this requires understanding the void tags I suppose
        # - select all subtrees that have a class
        # - materialize DOM

        # Safe-HTM8?  This is a filter
        return 0

    else:
        raise RuntimeError('Invalid action %r' % action)
|
| 291 |
|
| 292 |
|
# Run main() only when executed as a script, and use its return value as the
# process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
|