OILS / doctools / cmark.py View on Github | oils.pub

526 lines, 297 significant
1#!/usr/bin/env python2
2"""Convert Markdown to HTML, with our enhancements
3
4- Parse the HTML
5- insert a TOC
6- <pstrip> hack - this is obsolete with ul-table?
7- Expand $xref links
8- Highlight code blocks
9
10I started from cmark-0.28.3/wrappers/wrapper.py.
11"""
12from __future__ import print_function
13
14try:
15 from HTMLParser import HTMLParser
16except ImportError:
17 # python3
18 from html.parser import HTMLParser # type: ignore
19import json
20import optparse
21import os
22import pprint
23import subprocess
24import sys
25
26from doctools import html_lib
27from doctools import doc_html # templates
28from doctools import oils_doc
29from doctools import ul_table
30from data_lang import htm8
31
32if sys.version_info.major == 2:
33 from typing import Any, List, Dict, Tuple, Union, Optional, IO
34
35
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger for this module; output is currently switched off.

    The message is still %-formatted with args (when any are given), so a
    mismatched format string raises even though nothing is printed.
    """
    text = (msg % args) if args else msg

    # Flip to 1 to see the debug trace on stderr.
    enabled = 0
    if enabled:
        print(text, file=sys.stderr)
43
44
# Directory of the cmark build; the executable is expected at bin/cmark below it.
CMARK_WEDGE_DIR = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0'


def cmark_bin(md):
    # type: (str) -> str
    """Convert Markdown to HTML by piping it through the cmark executable.

    Args:
      md: Markdown source, written to the child's stdin.

    Returns:
      Whatever cmark wrote to stdout.

    Raises:
      RuntimeError: if cmark exits with a non-zero status.  Previously the
        exit status was ignored, so a failed conversion silently produced
        empty or partial HTML.
    """
    cmark_path = os.path.join(CMARK_WEDGE_DIR, 'bin/cmark')
    # Need to render raw HTML
    proc = subprocess.Popen([cmark_path, '--unsafe'],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(input=md)
    if proc.returncode != 0:
        raise RuntimeError('%s exited with status %d' %
                           (cmark_path, proc.returncode))
    return stdout
57
58
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">

    Each entry of self.headings is a tuple

        (line_num, tag, css_id, html_parts, text_parts)

    where css_id is the heading's id attribute or None, html_parts are the
    pieces of markup seen inside the heading (like innerHTML, minus <a> tags),
    and text_parts are only the text pieces.
    """

    def __init__(self):
        # type: () -> None
        try:
            # Python 3 decodes character references before calling
            # handle_data() unless told otherwise, which would put a raw '<'
            # or '&' into the TOC markup.  Keep them as separate events so
            # they can be re-emitted verbatim.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Python 2: the constructor takes no such argument.
            HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Nesting depth, used only for the debug trace.  It is decremented
        # for every </div> but incremented only for the TOC divs, so it can
        # go negative.
        self.indent = 0

        # The TOC will be inserted after this.  -1 means "not seen".
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we are between a heading's start and end tag.
        self.capturing = False

        # Flat list of (line_num, tag, id, HTML parts, text parts).
        # HTML is like innerHTML.  There can be <code> annotations and so
        # forth.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Record TOC placeholders and headings; capture markup in headings."""
        if tag == 'div':
            # The placeholder must have exactly one attribute: the id.
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing at the end of a heading; else capture the end tag."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        # type: (str) -> None
        """Re-emit a numeric reference (&#39; or &#x27;) seen in a heading.

        name is the part between '&#' and ';'.  Without this override the
        reference would be dropped from the TOC entry.  Like named entities,
        it goes into the HTML only, not the text used for the pretty href.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        # type: (str) -> None
        """Capture text inside a heading, as both HTML and plain text."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
165
166
# CSS class given to the TOC entry generated for each heading tag.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}

# We could just add an <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
173
174
def _MakeTocInsertions(
        opts,  # type: Any
        toc_tags,  # type: Union[List[str], Tuple[str, str]]
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """Turn the extracted headings and the TOC position into insertions.

    The result is a list of (line_num, text) pairs.  The first pair is the TOC
    itself, placed on the line after the TOC <div>; the rest are the
    <a name=""> targets, one pair per heading, at the heading's own line.

    The TOC is a flat list of <div class="toclevelN"> entries, e.g.

        <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>

    so indentation is left to CSS.

    Args:
      opts: parsed command line flags; only toc_pretty_href is read.
      toc_tags: HTML tags like ['h2', 'h3'] to SHOW in the TOC.  Targets are
        generated for ALL headings regardless.
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples.
      toc_pos: line number of the TOC <div>.
      preserve_anchor_case: passed through to html_lib.PrettyHref.
    """
    toc_html = ['<div id="toctitle">Table of Contents</div>\n']
    anchor_insertions = []

    for index, heading in enumerate(headings):
        line_num, tag, css_id, html_parts, text_parts = heading
        css_class = TAG_TO_CSS[tag]

        # Both the numeric and the pretty name may become targets, for
        # stability of old links.
        numeric_href = 'toc_%d' % index
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # A few old blog posts use an explicit CSS ID; link to that if given.
        # Otherwise always link to the pretty name.  The numeric name is
        # still a target, but never appears in the TOC.
        toc_href = css_id if css_id else pretty_href

        if tag in toc_tags:
            toc_html.append(
                ' <div class="%s"><a href="#%s">%s</a></div>\n' %
                (css_class, toc_href, ''.join(html_parts)))

        if opts.toc_pretty_href:  # NEW WAY
            anchor_names = [pretty_href]
        elif css_id:  # Old blog explicit
            anchor_names = [css_id, numeric_href]
        else:  # Old blog implicit; includes the NEW WAY too
            anchor_names = [pretty_href, numeric_href]

        anchors = ''.join(ANCHOR_FMT % name for name in anchor_names)
        anchor_insertions.append((line_num, anchors))

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    return [(toc_pos + 1, ''.join(toc_html))] + anchor_insertions
244
245
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """Build insertions for the dense-toc style with columns, used by doc/ref

    The plain TOC style emits one <div class="toclevelN"> per heading.  This
    style instead groups each h2 with the h3 headings that follow it:

        <div id="dense-toc-title">In This Chapter</div>
        <div id="dense-toc-cols">

        <div class="dense-toc-group">
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>   # NO BREAKING within this div

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

        </div>

    Headings other than h2 and h3 get an <a name=""> target but no TOC entry.

    Args:
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples.
      toc_pos: line number of the dense-toc <div>.
      preserve_anchor_case: passed through to html_lib.PrettyHref.

    Returns:
      (line_num, text) pairs: the TOC first (on the line after the <div>),
      then one target per heading.
    """
    # Two level tree: a list of (h2 html, h2 href, [(h3 html, h3 href), ...])
    groups = []
    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty name.
        toc_href = css_id or pretty_href
        entry_html = ''.join(html_parts)

        if tag == 'h2':
            groups.append((entry_html, toc_href, []))
        elif tag == 'h3':
            assert groups, "h3 shouldn't come before any h2"
            groups[-1][2].append((entry_html, toc_href))

        # The target <a name=""> always uses the pretty name.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_lines = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, children in groups:
        toc_lines.append('<div class="dense-toc-group">\n')
        toc_lines.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_lines.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in children)
        toc_lines.append('</div>\n')
    toc_lines.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_lines))
    log('')

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    return [(toc_pos + 1, ''.join(toc_lines))] + anchor_insertions
342
343
def _ApplyInsertions(lines, insertions, out_file):
    # type: (List[str], List[Tuple[int, str]], IO[str]) -> None
    """Write lines to out_file, emitting each insertion before its line.

    Args:
      lines: the document, one string per line, newlines included.
      insertions: (line_num, text) pairs; line_num is 1-based.  They may be in
        any order, and several may name the same line, in which case they are
        written in list order.  Pairs whose line_num is outside the document
        are ignored.
      out_file: destination; only write() is called on it.

    The insertions are grouped by line number up front.  A single cursor over
    the list would stall on the first entry that is out of order (e.g. a
    heading that precedes the TOC <div>) and drop everything after it.
    """
    assert insertions, "Should be at least one insertion"

    by_line = {}  # type: Dict[int, List[str]]
    for line_num, text in insertions:
        by_line.setdefault(line_num, []).append(text)

    for i, line in enumerate(lines):
        current_line = i + 1  # 1-based
        for text in by_line.get(current_line, ()):
            out_file.write(text)
        out_file.write(line)
360
361
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Convert Markdown from in_file to enhanced HTML on out_file.

    Steps: run cmark; when use_fastlex is set, post-process the HTML
    (optional code extraction, comment removal, the <pstrip> hack, tables,
    link expansion, code highlighting); then insert a table of contents if the
    document has a <div id="toc"> or <div id="dense-toc"> placeholder.
    Without a placeholder the HTML is written out unchanged.

    Args:
      opts: parsed flags; reads code_block_output and toc_tags here.
      meta: document metadata; reads 'default_highlighter' and
        'preserve_anchor_case'.
      in_file: Markdown source.
      out_file: where the HTML is written.
      use_fastlex: False skips all the HTML post-processing steps.
      debug_out: passed to oils_doc.HighlightCode; a fresh list if None.

    Raises:
      htm8.ParseError: re-raised from ul_table.ReplaceTables, after the input
        file is named on stderr.
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = cmark_bin(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as f:
                f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                        code_path)
                oils_doc.ExtractCode(html, f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        for marker in ('<p><pstrip>', '</pstrip></p>'):
            html = html.replace(marker, '')

        try:
            html = ul_table.ReplaceTables(html)
        except htm8.ParseError:
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks
        # Including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags if opts.toc_tags else ('h3', 'h4')

    extractor = TocExtractor()
    extractor.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in extractor.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    has_toc = extractor.toc_begin_line != -1
    has_dense_toc = extractor.dense_toc_begin_line != -1

    if not has_toc and not has_dense_toc:
        # No TOC placeholder found: pass through
        out_file.write(html)
        return

    # The plain placeholder wins if both are present.
    if has_toc:
        insertions = _MakeTocInsertions(opts, toc_tags, extractor.headings,
                                        extractor.toc_begin_line,
                                        preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(extractor.headings,
                                             extractor.dense_toc_begin_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    html_lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(html_lines, insertions, out_file)
447
448
def Options():
    # type: () -> Any
    """Return a new optparse.OptionParser for the cmark.py command line."""
    parser = optparse.OptionParser('cmark.py [options]')

    # Every boolean flag is off unless given.
    flag = {'action': 'store_true', 'default': False}

    parser.add_option('--common-mark',
                      help='Only do CommonMark conversion',
                      **flag)
    parser.add_option(
        '--toc-pretty-href',
        help='Generate textual hrefs #like-this rather than like #toc10',
        **flag)
    parser.add_option('--toc-tag',
                      dest='toc_tags',
                      action='append',
                      default=[],
                      help='h tags to include in the TOC, e.g. h2 h3')
    parser.add_option('--disable-fastlex',
                      dest='disable_fastlex',
                      help='Hack for old blog posts',
                      **flag)
    parser.add_option('--code-block-output',
                      dest='code_block_output',
                      default=None,
                      help='Extract and print code blocks to this file')

    return parser
480
481
# Metadata every document starts with, before its JSON is merged in.
# width 40 by default
DEFAULT_META = dict(body_css_class='width40')
484
485
def main(argv):
    """Command line entry point.

    Three modes, chosen from the flags and the number of arguments left after
    flag parsing (the program name counts as one):

    - --common-mark: run stdin through cmark and print the result.
    - 3 args (prog, JSON metadata, content file): Oils docs.  The rendered
      content is wrapped in the doc_html header and footer on stdout.
    - otherwise: a stdin -> stdout filter, used for the blog and benchmarks.
      A JSON metadata file may be given as the first argument.
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(cmark_bin(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) == 3:
        # Oils docs take 2 args: JSON and content HTML
        _, meta_path, content_path = argv
        with open(meta_path) as f:
            meta.update(json.load(f))

        # Docs have a special header and footer.
        with open(content_path) as content_f:
            doc_html.Header(meta, sys.stdout, draft_warning=True)
            Render(opts, meta, content_f, sys.stdout)
            doc_html.Footer(meta, sys.stdout)
        return

    # Filter usage for blog and for benchmarks.

    # Metadata is optional here
    try:
        with open(argv[1]) as f:
            meta.update(json.load(f))
    except IndexError:
        pass

    # Old style for blog: it's a filter
    Render(opts,
           meta,
           sys.stdin,
           sys.stdout,
           use_fastlex=not opts.disable_fastlex)


if __name__ == '__main__':
    main(sys.argv)