doctools/cmark.py

OILS / doctools / cmark.py View on Github | oils.pub

526 lines, 297 significant

1	#!/usr/bin/env python2
2	"""Convert Markdown to HTML, with our enhancements
3
4	- Parse the HTML
5	- insert a TOC
6	- <pstrip> hack - this is obsolete with ul-table?
7	- Expand $xref links
8	- Highlight code blocks
9
10	I started from cmark-0.28.3/wrappers/wrapper.py.
11	"""
12	from __future__ import print_function
13
14	try:
15	from HTMLParser import HTMLParser
16	except ImportError:
17	# python3
18	from html.parser import HTMLParser # type: ignore
19	import json
20	import optparse
21	import os
22	import pprint
23	import subprocess
24	import sys
25
26	from doctools import html_lib
27	from doctools import doc_html # templates
28	from doctools import oils_doc
29	from doctools import ul_table
30	from data_lang import htm8
31
32	if sys.version_info.major == 2:
33	from typing import Any, List, Dict, Tuple, Union, Optional, IO
34
35
def log(msg, *args):
    # type: (str, Any) -> None
    """Debug logger for this module; output is switched off by default.

    The message is %-formatted with args (when any are given) even though
    printing is disabled, so a bad format string still raises.
    """
    text = msg % args if args else msg

    # Flip the 0 to 1 to see debug output on stderr.
    if 0:
        print(text, file=sys.stderr)
43
44
# Install prefix of the cmark build; cmark_bin() runs bin/cmark from here.
CMARK_WEDGE_DIR = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0'
46
47
def cmark_bin(md):
    # type: (str) -> str
    """Convert Markdown to HTML by piping it through the cmark binary.

    Args:
      md: Markdown source, written to the child's stdin.

    Returns:
      Whatever cmark wrote to stdout (the rendered HTML).

    Raises:
      RuntimeError: if cmark exits with a non-zero status.  Previously the
        exit status was ignored, so a failed conversion silently produced
        empty or partial HTML.
      OSError: from subprocess.Popen, if the binary can't be executed.
    """
    cmark_path = os.path.join(CMARK_WEDGE_DIR, 'bin/cmark')

    # Need to render raw HTML
    proc = subprocess.Popen([cmark_path, '--unsafe'],
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(input=md)

    if proc.returncode != 0:
        raise RuntimeError('%s exited with status %d' %
                           (cmark_path, proc.returncode))
    return stdout
57
58
class TocExtractor(HTMLParser):
    """Extract Table of Contents

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading (may be obsolete, <h2 id=""> is OK)
    - The TOC after <div id="toc">

    Attributes set while parsing:
      toc_begin_line: line of <div id="toc">, or -1 if not seen
      dense_toc_begin_line: line of <div id="dense-toc">, or -1 if not seen
      headings: one tuple per heading, see __init__
    """

    def __init__(self):
        # type: () -> None
        HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']

        # Only used to indent the debug log output.
        self.indent = 0

        # The TOC will be inserted after this.
        self.toc_begin_line = -1
        self.dense_toc_begin_line = -1

        # True while we're between <hN> and </hN>
        self.capturing = False

        # Flat list of (line_num, tag, id, html_parts, text_parts)
        # - html_parts joined is like innerHTML.  There can be <code>
        #   annotations and so forth.
        # - text_parts joined is the plain text, used to make a pretty href.
        # - id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        # type: (str, List[Tuple[str, str]]) -> None
        """Note TOC placeholders, and start capturing at each heading tag."""
        if tag == 'div':
            # The attribute list must match exactly, e.g. <div id="toc">
            if attrs == [('id', 'toc')]:
                log('%s> %s %s', self.indent * ' ', tag, attrs)
                self.indent += 1
                self.toc_begin_line, _ = self.getpos()
            elif attrs == [('id', 'dense-toc')]:
                self.indent += 1
                self.dense_toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        # type: (str) -> None
        """Stop capturing at </hN>; otherwise copy the end tag if capturing."""
        # Debug print
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        # type: (str) -> None
        """
        From Python docs:
          This method is called to process a named character reference of the
          form &name; (e.g. &gt;), where name is a general entity reference
          (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, data):
        # type: (str) -> None
        """Keep numeric character references like &#39; or &#x27; in headings.

        The parser calls this with the part between '&#' and ';'.  The base
        class implementation does nothing, so without this override such
        characters would vanish from the heading HTML shown in the TOC.  Like
        handle_entityref(), this only contributes to the HTML, not the text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % data)

    def handle_data(self, data):
        # type: (str) -> None
        """Accumulate the text inside a heading, as both HTML and plain text."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        # type: (str) -> None
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        # type: (str) -> None
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
165
166
# CSS class given to each TOC entry, keyed by the heading tag it links to.
TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}

# We could just add <h2 id="foo"> attribute!  I didn't know those are valid
# anchors.
# But it's easier to insert an entire line, rather than part of a line.
ANCHOR_FMT = '<a name="%s"></a>\n'
173
174
175	def _MakeTocInsertions(
176	opts, # type: Any
177	toc_tags, # type: Union[List[str], Tuple[str, str]]
178	headings, # type: List[Tuple[int, str, None, List[str], List[str]]]
179	toc_pos, # type: int
180	preserve_anchor_case, # type: bool
181	):
182	# type: (...) -> List[Tuple[int, str]]
183	"""Given extract headings list and TOC position, return a list of insertions.
184
185	The insertions <div> for the TOC itself, and <a name=""> for the targets.
186
187	Args:
188	toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
189	all of them.
190	"""
191	# Example:
192	# <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
193	#
194	# Yeah it's just a flat list, and then indentation is done with CSS. Hm
195	# that's easy.
196
197	toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
198	insertions = []
199
200	i = 0
201	for line_num, tag, css_id, html_parts, text_parts in headings:
202	css_class = TAG_TO_CSS[tag]
203
204	# Add BOTH href, for stability.
205	numeric_href = 'toc_%d' % i
206
207	# If there was an explicit CSS ID written by the user, use that as the href.
208	# I used this in the blog a few times.
209
210	pretty_href = html_lib.PrettyHref(
211	''.join(text_parts), preserve_anchor_case=preserve_anchor_case)
212
213	if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
214	toc_href = css_id
215	else:
216	# Always use the pretty version now. The old numeric version is still a
217	# target, but not in the TOC.
218	toc_href = pretty_href
219
220	line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
221	css_class, toc_href, ''.join(html_parts))
222	if tag in toc_tags:
223	toc_lines.append(line)
224
225	targets = []
226	if opts.toc_pretty_href: # NEW WAY
227	targets.append(ANCHOR_FMT % pretty_href)
228	elif css_id: # Old blog explicit
229	targets.append(ANCHOR_FMT % css_id)
230	targets.append(ANCHOR_FMT % numeric_href)
231	else: # Old blog implicit
232	targets.append(ANCHOR_FMT % pretty_href) # Include the NEW WAY too
233	targets.append(ANCHOR_FMT % numeric_href)
234
235	insertions.append((line_num, ''.join(targets)))
236
237	i += 1
238
239	# +1 to insert AFTER the <div>
240	toc_insert = (toc_pos + 1, ''.join(toc_lines))
241	insertions.insert(0, toc_insert) # The first insertion is TOC
242
243	return insertions
244
245
def _MakeTocInsertionsDense(
        headings,  # type: List[Tuple[int, str, Optional[str], List[str], List[str]]]
        toc_pos,  # type: int
        preserve_anchor_case,  # type: bool
):
    # type: (...) -> List[Tuple[int, str]]
    """Build insertions for the dense-toc style with columns (doc/ref).

    The regular TOC is a flat run of divs, one per heading:

      <div id="toctitle">Table of Contents</div>

      <div class="toclevel1"><a ...> Level 1 </a></div>
      <div class="toclevel2"><a ...> 1.A </a></div>
      <div class="toclevel1"><a ...> Level 2 </a></div>

    The dense one groups each h2 with the h3's under it:

      <div id="dense-toc-title">In This Chapter</div>
      <div id="dense-toc-cols">

        <div class="dense-toc-group">     # NO BREAKING within this div
          <a ...> Level 1 </a> <br/>
          <a class="dense-toc-h3" ...> 1.A </a> <br/>
          <a class="dense-toc-h3" ...> 1.B </a> <br/>
        </div>

        <div class="dense-toc-group">
          <a ...> Level 2 </a> <br/>
        </div>

      </div>

    Headings other than h2 and h3 don't appear in the TOC, but every heading
    still gets an <a name=""> target.

    Args:
      headings: (line_num, tag, css_id, html_parts, text_parts) tuples
      toc_pos: line of the TOC <div>; the TOC goes on the line after it
      preserve_anchor_case: passed through to html_lib.PrettyHref()

    Returns:
      The TOC insertion first, then one anchor insertion per heading.
    """
    # Two level tree: [(h2_html, h2_href, [(h3_html, h3_href), ...]), ...]
    groups = []
    open_group = None

    anchor_insertions = []

    for line_num, tag, css_id, html_parts, text_parts in headings:
        pretty_href = html_lib.PrettyHref(
            ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

        # doc/ref can use <h3 id="explicit"></h3>; otherwise link to the
        # pretty name.
        toc_href = css_id or pretty_href

        inner_html = ''.join(html_parts)

        if tag == 'h2':
            open_group = (inner_html, toc_href, [])
            groups.append(open_group)
        elif tag == 'h3':
            assert open_group is not None, "h3 shouldn't come before any h2"
            open_group[2].append((inner_html, toc_href))

        # The target <a name=""> always uses the pretty name.
        anchor_insertions.append((line_num, ANCHOR_FMT % pretty_href))

    log('Heading Tree:')
    log(pprint.pformat(groups))
    log('')

    toc_html = [
        '<div id="dense-toc-title">In This Chapter</div>\n',
        '<div id="dense-toc-cols">\n',
    ]
    for h2_html, h2_href, h3_entries in groups:
        toc_html.append('<div class="dense-toc-group">\n')
        toc_html.append(' <a href="#%s">%s</a> <br/>\n' % (h2_href, h2_html))
        toc_html.extend(
            ' <a class="dense-toc-h3" href="#%s">%s</a> <br/>\n' %
            (h3_href, h3_html) for h3_html, h3_href in h3_entries)
        toc_html.append('</div>\n')
    toc_html.append('</div>\n')

    log('TOC lines')
    log(pprint.pformat(toc_html))
    log('')

    # +1 to insert AFTER the <div>.  The first insertion is the TOC.
    return [(toc_pos + 1, ''.join(toc_html))] + anchor_insertions
342
343
344	def _ApplyInsertions(lines, insertions, out_file):
345	# type: (List[str], List[Tuple[int, str]], IO[str]) -> None
346	assert insertions, "Should be at least one insertion"
347	j = 0
348	n = len(insertions)
349
350	for i, line in enumerate(lines):
351	current_line = i + 1 # 1-based
352
353	if j < n:
354	line_num, s = insertions[j]
355	if current_line == line_num:
356	out_file.write(s)
357	j += 1
358
359	out_file.write(line)
360
361
def Render(
        opts,  # type: Any
        meta,  # type: Dict
        in_file,  # type: IO[str]
        out_file,  # type: IO[str]
        use_fastlex=True,  # type: bool
        debug_out=None,  # type: Optional[Any]
):
    # type: (...) -> None
    """Convert the Markdown in in_file to HTML, and write it to out_file.

    Steps: run cmark, post-process the HTML (when use_fastlex is set), then
    insert a TOC and heading anchors if the document has a <div id="toc"> or
    <div id="dense-toc">.  Without either div, the HTML is written unchanged.

    Args:
      opts: parsed flags; code_block_output, toc_tags and toc_pretty_href are
        used
      meta: document metadata; 'default_highlighter' and
        'preserve_anchor_case' are read
      in_file: Markdown input
      out_file: HTML output
      use_fastlex: if false, skip code extraction, comment removal, the
        <pstrip> hack, table replacement, link expansion and highlighting.
        NOTE(review): the extent of this branch was reconstructed from a
        listing without indentation -- confirm against the real file.
      debug_out: passed to oils_doc.HighlightCode(); defaults to a new list
    """
    if debug_out is None:
        debug_out = []

    # First convert to HTML
    html = cmark_bin(in_file.read())

    # Now process HTML with oils_doc
    if use_fastlex:
        # Note: extract code BEFORE doing the HTML highlighting.
        code_path = opts.code_block_output
        if code_path:
            with open(code_path, 'w') as code_f:
                code_f.write(
                    '# %s: code blocks extracted from Markdown/HTML\n\n' %
                    code_path)
                oils_doc.ExtractCode(html, code_f)

        html = ul_table.RemoveComments(html)

        # Hack for allowing tables without <p> in cells, which CommonMark
        # seems to require?
        html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

        try:
            html = ul_table.ReplaceTables(html)
        except htm8.ParseError:
            # Say which file it was, then let the error propagate.
            print('Error rendering file %r' % in_file, file=sys.stderr)
            raise

        # Expand $xref, etc.
        html = oils_doc.ExpandLinks(html)

        # <code> blocks, including class=language-oil-help-topics
        html = oils_doc.HighlightCode(html,
                                      meta.get('default_highlighter'),
                                      debug_out=debug_out)

    # h2 is the title.  h1 is unused.
    toc_tags = opts.toc_tags or ('h3', 'h4')

    parser = TocExtractor()
    parser.feed(html)

    log('')
    log('*** HTML headings:')
    for heading in parser.headings:
        log(heading)

    preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))

    toc_line = parser.toc_begin_line
    dense_toc_line = parser.dense_toc_begin_line

    if toc_line == -1 and dense_toc_line == -1:
        # No TOC placeholder of either kind: pass through
        out_file.write(html)
        return

    # The regular TOC wins if both placeholders are present.
    if toc_line != -1:
        insertions = _MakeTocInsertions(opts, toc_tags, parser.headings,
                                        toc_line, preserve_anchor_case)
    else:
        insertions = _MakeTocInsertionsDense(parser.headings, dense_toc_line,
                                             preserve_anchor_case)

    log('')
    log('*** Text Insertions:')
    for ins in insertions:
        log(ins)

    log('')
    log('*** Output:')

    lines = html.splitlines(True)  # keep newlines
    _ApplyInsertions(lines, insertions, out_file)
447
448
def Options():
    # type: () -> Any
    """Return a new optparse.OptionParser with the flags cmark.py accepts."""
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, add_option() keyword arguments), in the order shown by --help
    option_specs = [
        ('--common-mark',
         dict(action='store_true',
              default=False,
              help='Only do CommonMark conversion')),
        ('--toc-pretty-href',
         dict(action='store_true',
              default=False,
              help='Generate textual hrefs #like-this rather than like #toc10'
              )),
        ('--toc-tag',
         dict(dest='toc_tags',
              action='append',
              default=[],
              help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex',
         dict(dest='disable_fastlex',
              action='store_true',
              default=False,
              help='Hack for old blog posts')),
        ('--code-block-output',
         dict(dest='code_block_output',
              default=None,
              help='Extract and print code blocks to this file')),
    ]
    for flag, kwargs in option_specs:
        parser.add_option(flag, **kwargs)

    return parser
480
481
# Metadata defaults: main() copies this dict, then overlays the JSON metadata
# file when one is given.
# width 40 by default
DEFAULT_META = {'body_css_class': 'width40'}
484
485
def main(argv):
    """Command line entry point.

    Usage, where argv[0] is the program name:

      cmark.py --common-mark < in.md       # only run cmark, print the HTML
      cmark.py META.json CONTENT.md        # Oils docs: header + body + footer
      cmark.py [META.json] < in.md         # filter, for blog and benchmarks

    In each mode the result goes to stdout.
    """
    parser = Options()
    opts, argv = parser.parse_args(argv)
    assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

    if opts.common_mark:
        print(cmark_bin(sys.stdin.read()))
        return

    meta = dict(DEFAULT_META)

    if len(argv) != 3:
        # Filter usage for blog and for benchmarks.

        # Metadata is optional here: argv[1] may not exist.
        try:
            with open(argv[1]) as meta_f:
                meta.update(json.load(meta_f))
        except IndexError:
            pass

        # Old style for blog: it's a filter
        Render(opts,
               meta,
               sys.stdin,
               sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # Oils docs take 2 args: JSON and content HTML
    meta_path, content_path = argv[1], argv[2]

    with open(meta_path) as meta_f:
        meta.update(json.load(meta_f))

    # Docs have a special header and footer.
    with open(content_path) as content_f:
        doc_html.Header(meta, sys.stdout, draft_warning=True)
        Render(opts, meta, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)
523
524
# Run as a script: main() gets the full argv, including the program name.
if __name__ == '__main__':
    main(sys.argv)