benchmarks/report.R

OILS / benchmarks / report.R View on Github | oils.pub

1390 lines, 964 significant

1	#!/usr/bin/env Rscript
2	#
3	# benchmarks/report.R -- Analyze data collected by shell scripts.
4	#
5	# Usage:
6	# benchmarks/report.R OUT_DIR [TIMES_CSV...]
7
8	# Suppress warnings about functions masked from 'package:stats' and 'package:base'
9	# filter, lag
10	# intersect, setdiff, setequal, union
11	library(dplyr, warn.conflicts = FALSE)
12	library(tidyr) # spread()
13	library(stringr)
14
15	source('benchmarks/common.R')
16
17	options(stringsAsFactors = F)
18
19	# For pretty printing
# Returns x formatted with ',' as the thousands separator.
commas <- function(x) {
  with_separators <- format(x, big.mark = ',')
  with_separators
}
23
# Returns the Github URL (master branch of oilshell/oil) for a repo-relative
# path.
sourceUrl <- function(path) {
  url_format <- 'https://github.com/oilshell/oil/blob/master/%s'
  sprintf(url_format, path)
}
27
28	# Takes a filename, not a path.
# Returns the Github URL of a file that lives directly under
# benchmarks/testdata/.
sourceUrl2 <- function(filename) {
  url_format <-
    'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s'
  sprintf(url_format, filename)
}
34
# Returns the Github URL of the Python source for a mycpp example, given the
# example name without the .py extension.
mycppUrl <- function(name) {
  url_format <-
    'https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py'
  sprintf(url_format, name)
}
38
# Returns the relative link to the generated C++ for a mycpp example.
genUrl <- function(name) {
  link_format <- '../../_gen/mycpp/examples/%s.mycpp.cc'
  sprintf(link_format, name)
}
42
43
44	# TODO: Set up cgit because Github links are slow.
# Returns a Github URL into the oilshell/benchmark-data repo, built from
# subdir, name and suffix (suffix is appended directly after name).
benchmarkDataLink <- function(subdir, name, suffix) {
  # Previously a relative link:
  #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
  url_format <-
    'https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s'
  sprintf(url_format, subdir, name, suffix)
}
50
# Returns a relative link '../<subdir>/<name><suffix>'.  Same signature as
# benchmarkDataLink, so the two are interchangeable.
provenanceLink <- function(subdir, name, suffix) {
  link_format <- '../%s/%s%s'
  sprintf(link_format, subdir, name, suffix)
}
54
55
GetOshLabel <- function(shell_hash, prov_dir) {
  ### Given the hash of an osh shell, return a readable label for it.
  #
  # The label is derived from the contents of
  #   <prov_dir>/shell-id/osh-<shell_hash>/sh-path.txt
  #
  # Stops if that file doesn't exist, or if none of the known path patterns
  # occurs in it.

  path <- sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)

  if (!file.exists(path)) {
    stop(sprintf("%s doesn't exist", path))
  }

  Log('Reading %s', path)
  lines <- readLines(path)

  # TRUE if any line of sh-path.txt matches the regex
  has_match <- function(pattern) {
    length(grep(pattern, lines)) > 0
  }

  # The order of these checks matters: the first pattern that matches wins.
  if (has_match('_bin/osh')) {
    return('osh-ovm')
  }
  if (has_match('bin/osh')) {
    return('osh-cpython')
  }
  if (has_match('_bin/.*/mycpp-souffle/osh')) {
    return('osh-souffle')
  }
  if (has_match('_bin/.*/osh')) {
    return('osh-native')
  }

  stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
}
80
# Path suffixes used to recognize shell binaries by the end of their path.
# Each binary has two variants: one under _bin/cxx-opt and one under
# _bin/cxx-opt-sh.

# osh
osh_opt_suffix1 <- '_bin/cxx-opt/osh'
osh_opt_suffix2 <- '_bin/cxx-opt-sh/osh'
osh_souffle_suffix1 <- '_bin/cxx-opt/mycpp-souffle/osh'
osh_souffle_suffix2 <- '_bin/cxx-opt-sh/mycpp-souffle/osh'

# ysh
ysh_opt_suffix1 <- '_bin/cxx-opt/ysh'
ysh_opt_suffix2 <- '_bin/cxx-opt-sh/ysh'
ysh_souffle_suffix1 <- '_bin/cxx-opt/mycpp-souffle/ysh'
ysh_souffle_suffix2 <- '_bin/cxx-opt-sh/mycpp-souffle/ysh'
92
ShellLabels = function(shell_name, shell_hash, num_hosts) {
  ### Given 2 vectors, return a vector of readable labels.
  #
  # Args:
  #   shell_name: character vector of shell names or paths
  #   shell_hash: vector parallel to shell_name.  Only consulted for entries
  #     whose name is exactly 'osh'.
  #   num_hosts: number of hosts in the data set; selects the directory that
  #     GetOshLabel() reads from
  #
  # Returns:
  #   A character vector with one label per element of shell_name.  An empty
  #   shell_name gives character(0).

  # TODO: Clean up callers.  Some metrics call this function with a
  # shell/runtime BASENAME, and others a PATH
  # - e.g. ComputeReport calls this with runtime_name which is actually a PATH

  #Log('name %s', shell_name)
  #Log('hash %s', shell_hash)

  if (num_hosts == 1) {
    prov_dir = '_tmp'
  } else {
    prov_dir = '../benchmark-data/'
  }

  LabelOne = function(i) {
    sh = shell_name[i]
    if (sh == 'osh') {
      # NOTE: shell_hash must only be evaluated in this branch.  Callers may
      # pass an expression that is only valid when an 'osh' entry exists, and
      # R's lazy argument evaluation lets that work.
      label = GetOshLabel(shell_hash[i], prov_dir)

    } else if (endsWith(sh, 'osh-static')) {
      label = 'osh-static'
    } else if (endsWith(sh, 'ysh-static')) {
      label = 'ysh-static'

    } else if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
      label = 'opt/osh'
    } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
      label = 'opt/ysh'

    } else if (endsWith(sh, osh_souffle_suffix1) ||
               endsWith(sh, osh_souffle_suffix2)) {
      label = 'opt/osh-souffle'
    } else if (endsWith(sh, ysh_souffle_suffix1) ||
               endsWith(sh, ysh_souffle_suffix2)) {
      label = 'opt/ysh-souffle'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label = 'bumpleak/osh'

    } else {
      # Unrecognized names are used as their own label
      label = sh
    }

    Log('[%s] [%s]', sh, label)
    label
  }

  # seq_along() rather than 1:length(), which would iterate over c(1, 0) for
  # an empty vector and fail on an NA condition.  vapply() also avoids growing
  # the result one element at a time.
  vapply(seq_along(shell_name), LabelOne, character(1))
}
143
144	# Simple version of the above, used by benchmarks/{gc,osh-runtime}
ShellLabelFromPath = function(sh_path) {
  ### Given a vector of shell paths, return a vector of readable labels.
  #
  # Unlike ShellLabels(), this only looks at the path, never at a shell hash.
  #
  # Returns:
  #   A character vector with one label per element of sh_path.  Paths that
  #   match no known suffix are used as their own label.  An empty sh_path
  #   gives character(0).

  LabelOne = function(sh) {
    if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
      # the opt binary is called osh-native - the osh-runtime report relies on this
      label = 'osh-native'

    } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
      label = 'opt/ysh'

    } else if (endsWith(sh, 'osh-static')) {
      label = 'osh-static'
    } else if (endsWith(sh, 'ysh-static')) {
      label = 'ysh-static'

    } else if (endsWith(sh, osh_souffle_suffix1) ||
               endsWith(sh, osh_souffle_suffix2)) {
      label = 'osh-souffle'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label = 'bumpleak/osh'

    } else if (endsWith(sh, '_bin/osh')) {  # the app bundle
      label = 'osh-ovm'

    } else if (endsWith(sh, 'bin/osh')) {
      label = 'osh-cpython'

    } else {
      label = sh
    }
    label
  }

  # Iterate by index, so an empty input yields character(0).  The previous
  # 1:length(sh_path) loop ran with i = 1 and i = 0 on empty input and failed.
  vapply(seq_along(sh_path), function(i) LabelOne(sh_path[i]), character(1))
}
181
DistinctHosts <- function(t) {
  ### Return the distinct (host_name, host_hash) rows of t, with a host_label
  ### column added.

  hosts <- distinct(t, host_name, host_hash)

  # The label is just the name
  hosts$host_label <- hosts$host_name
  hosts
}
188
DistinctShells <- function(t, num_hosts = -1) {
  ### Return the distinct (shell_name, shell_hash) rows of t, with a
  ### shell_label column computed by ShellLabels().
  #
  # Args:
  #   t: table with shell_name and shell_hash columns.  It also needs
  #     host_name and host_hash when num_hosts isn't passed.
  #   num_hosts: number of hosts; -1 means count the distinct hosts in t

  shells <- distinct(t, shell_name, shell_hash)

  Log('')
  Log('Labeling shells')

  # Calculate it if not passed
  host_count <- if (num_hosts == -1) nrow(DistinctHosts(t)) else num_hosts

  shells$shell_label <- ShellLabels(shells$shell_name, shells$shell_hash,
                                    host_count)
  shells
}
205
ParserReport = function(in_dir, out_dir) {
  ### Analyze parser benchmark data in in_dir and write tables to out_dir.
  #
  # Reads times.csv, lines.csv, raw-data.csv and cachegrind.tsv from in_dir.
  #
  # Always writes: hosts and shells (via WriteProvenance), raw-data, summary,
  # cachegrind_summary.
  # If some host is labeled 'no-host': also writes times_flat and
  # cachegrind_flat.
  # Otherwise: also writes elapsed, rate, max_rss and instructions, which have
  # one column per shell and therefore require all of those shells to be
  # present in the data.

  times = read.csv(file.path(in_dir, 'times.csv'))
  lines = read.csv(file.path(in_dir, 'lines.csv'))
  raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))

  cachegrind = readTsv(file.path(in_dir, 'cachegrind.tsv'))

  # Remove failures
  times %>% filter(status == 0) %>% select(-c(status)) -> times
  cachegrind %>% filter(status == 0) %>% select(-c(status)) -> cachegrind

  # Add the number of lines, joining on path, and compute lines/ms
  times %>%
    left_join(lines, by = c('path')) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path),
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           lines_per_ms = num_lines / elapsed_ms) %>%
    select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs)) ->
    joined_times

  #print(head(times))
  #print(head(lines))
  #print(head(vm))
  #print(head(joined_times))

  print(summary(joined_times))

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  distinct_hosts = DistinctHosts(joined_times)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells = DistinctShells(joined_times)
  Log('')
  Log('Distinct shells')
  print(distinct_shells)

  # Replace name/hash combinations with labels.
  joined_times %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
    select(-c(host_name, host_hash, shell_name, shell_hash)) ->
    joined_times

  # Like 'times', but do shell_label as one step
  # Hack: we know benchmarks/auto.sh runs this on one machine
  distinct_shells_2 = DistinctShells(cachegrind, num_hosts = nrow(distinct_hosts))
  cachegrind %>%
    left_join(lines, by = c('path')) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
    select(-c(shell_name, shell_hash)) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
    select(-c(path)) ->
    joined_cachegrind

  Log('summary(joined_times):')
  print(summary(joined_times))
  Log('head(joined_times):')
  print(head(joined_times))

  # Summarize rates by platform/shell
  joined_times %>%
    mutate(host_label = paste("host", host_label)) %>%
    group_by(host_label, shell_label) %>%
    summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
    mutate(lines_per_ms = total_lines / total_ms) %>%
    select(-c(total_ms)) %>%
    spread(key = host_label, value = lines_per_ms) ->
    times_summary

  # Sort by parsing rate on machine 1.  Prefer the well-known host columns,
  # and otherwise fall back to the first 'host ...' column, so that data from
  # any other machine doesn't fail on a missing column.
  host_cols = grep('^host ', colnames(times_summary), value = TRUE)
  if ("host hoover" %in% host_cols) {
    sort_col = "host hoover"
  } else if ("host no-host" %in% host_cols) {
    sort_col = "host no-host"
  } else if (length(host_cols) > 0) {
    sort_col = host_cols[1]
  } else {
    sort_col = NULL  # no host columns at all, e.g. every task failed
  }
  if (!is.null(sort_col)) {
    times_summary %>% arrange(desc(.data[[sort_col]])) -> times_summary
  }

  Log('times_summary:')
  print(times_summary)

  # Summarize cachegrind by platform/shell
  # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
  joined_cachegrind %>%
    group_by(shell_label) %>%
    summarize(total_lines = sum(num_lines), total_irefs = sum(as.numeric(irefs))) %>%
    mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
    select(-c(total_irefs)) ->
    cachegrind_summary

  if ("no-host" %in% distinct_hosts$host_label) {

    # We don't have all the shells
    elapsed = NULL
    rate = NULL
    max_rss = NULL
    instructions = NULL

    joined_times %>%
      select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
               num_lines, filename, filename_HREF)) %>%
      arrange(filename, elapsed_ms) ->
      times_flat

    joined_cachegrind %>%
      select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
      arrange(filename, irefs) ->
      cachegrind_flat

  } else {

    times_flat = NULL
    cachegrind_flat = NULL

    # Hack for release.  TODO: unify with Soil
    if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
      souffle_col = c('osh-souffle')
    } else {
      souffle_col = c()
    }

    cols1 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
              'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
              'osh_to_bash_ratio', 'num_lines', 'filename', 'filename_HREF')

    # Elapsed seconds for each shell by platform and file
    joined_times %>%
      select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = elapsed_ms) %>%
      arrange(host_label, num_lines) %>%
      mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
      select(all_of(cols1)) ->
      elapsed

    Log('\n')
    Log('ELAPSED')
    print(elapsed)

    cols2 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
              'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
              'num_lines', 'filename', 'filename_HREF')
    # Rates by file and shell
    joined_times %>%
      select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = lines_per_ms) %>%
      arrange(host_label, num_lines) %>%
      select(all_of(cols2)) ->
      rate

    Log('\n')
    Log('RATE')
    print(rate)

    # Memory usage by file
    joined_times %>%
      select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
      spread(key = shell_label, value = max_rss_MB) %>%
      arrange(host_label, num_lines) %>%
      select(all_of(cols2)) ->
      max_rss

    Log('\n')
    Log('MAX RSS')
    print(max_rss)

    Log('\n')
    Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
    print(joined_cachegrind)
    #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))

    cols3 = c('bash', 'dash', 'mksh', 'osh-native', souffle_col,
              'num_lines', 'filename', 'filename_HREF')

    # Cachegrind instructions by file
    joined_cachegrind %>%
      mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
      select(-c(irefs)) %>%
      spread(key = shell_label, value = thousand_irefs_per_line) %>%
      arrange(num_lines) %>%
      select(all_of(cols3)) ->
      instructions

    Log('\n')
    Log('instructions has %d rows', nrow(instructions))
    print(instructions)
  }

  WriteProvenance(distinct_hosts, distinct_shells, out_dir)

  raw_data_table = tibble(
    filename = basename(as.character(raw_data$path)),
    filename_HREF = benchmarkDataLink('osh-parser', filename, '')
  )
  #print(raw_data_table)

  writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))

  precision = SamePrecision(0)  # lines per ms
  writeCsv(times_summary, file.path(out_dir, 'summary'), precision)

  precision = ColumnPrecision(list(), default = 1)
  writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'), precision)

  if (!is.null(times_flat)) {
    precision = SamePrecision(0)
    writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
  }

  if (!is.null(cachegrind_flat)) {
    precision = SamePrecision(0)
    writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
  }

  if (!is.null(elapsed)) {  # only set when there is no 'no-host' host
    # Round to nearest millisecond, but the ratio has a decimal point.
    precision = ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
    writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)

    precision = SamePrecision(0)
    writeCsv(rate, file.path(out_dir, 'rate'), precision)

    writeCsv(max_rss, file.path(out_dir, 'max_rss'))

    precision = SamePrecision(1)
    writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
  }

  Log('Wrote %s', out_dir)
}
448
WriteProvenance <- function(distinct_hosts, distinct_shells, out_dir, tsv = F) {
  ### Write the 'hosts' and 'shells' tables under out_dir.
  #
  # Args:
  #   distinct_hosts: table with host_label, host_name, host_hash
  #   distinct_shells: table with shell_label, shell_name, shell_hash
  #   out_dir: output directory
  #   tsv: if true, write with writeTsv(); otherwise with writeCsv()

  # With exactly one host, IDs link to a sibling directory; otherwise they
  # link to the benchmark-data repo on Github.
  linkify <- if (nrow(distinct_hosts) == 1) provenanceLink else benchmarkDataLink

  Log('distinct_hosts')
  print(distinct_hosts)
  Log('')

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  # Should be:
  # host_id_url
  # And then csv_to_html will be smart enough?  It should take --url flag?
  host_ids <- paste(distinct_hosts$host_name, distinct_hosts$host_hash,
                    sep = '-')
  host_table <- tibble(
    host_label = distinct_hosts$host_label,
    host_id = host_ids,
    host_id_HREF = linkify('host-id', host_id, '/')
  )
  Log('host_table')
  print(host_table)
  Log('')

  shell_ids <- paste(distinct_shells$shell_name, distinct_shells$shell_hash,
                     sep = '-')
  shell_table <- tibble(
    shell_label = distinct_shells$shell_label,
    shell_id = shell_ids,
    shell_id_HREF = linkify('shell-id', shell_id, '/')
  )

  Log('shell_table')
  print(shell_table)
  Log('')

  write_table <- if (tsv) writeTsv else writeCsv
  write_table(host_table, file.path(out_dir, 'hosts'))
  write_table(shell_table, file.path(out_dir, 'shells'))
}
498
WriteSimpleProvenance <- function(provenance, out_dir) {
  ### Write the hosts and shells tables (as TSV) from one provenance table.
  #
  # provenance needs the columns host_label, host_name, host_hash,
  # shell_label, sh_path and shell_hash.

  Log('provenance')
  print(provenance)
  Log('')

  # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
  # benchmarks/id.sh publish-shell-id uses
  distinct_shells <- provenance %>%
    mutate(shell_name = basename(sh_path)) %>%
    distinct(shell_label, shell_name, shell_hash)

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  distinct_hosts <- distinct(provenance, host_label, host_name, host_hash)

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = TRUE)
}
519
RuntimeReport <- function(in_dir, out_dir) {
  ### Analyze osh-runtime data in in_dir and write tables to out_dir.
  #
  # Reads times.tsv, gc_stats.tsv and provenance.tsv.  Stops if any task has
  # a nonzero status.  Writes hosts and shells (via WriteSimpleProvenance),
  # elapsed, page_faults, max_rss, gc_stats, details and details_io.

  times <- readTsv(file.path(in_dir, 'times.tsv'))

  gc_stats <- readTsv(file.path(in_dir, 'gc_stats.tsv'))
  provenance <- readTsv(file.path(in_dir, 'provenance.tsv'))

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some osh-runtime tasks failed')
  }

  # Joins:
  # times <= sh_path => provenance
  # times <= join_id, host_name => gc_stats

  # TODO: provenance may have rows from 2 machines.  Could validate them and
  # deduplicate.

  # It should have (host_label, host_name, host_hash)
  #                (shell_label, sh_path, shell_hash)
  provenance <- provenance %>%
    mutate(host_label = host_name, shell_label = ShellLabelFromPath(sh_path))

  label_lookup <- provenance %>% distinct(sh_path, shell_label)

  Log('label_lookup')
  print(label_lookup)

  # Join with provenance for host label and shell label
  details <- times %>%
    select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
             host_name, sh_path, workload)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload)

  # Same join, but for the I/O and scheduling counters
  details_io <- times %>%
    select(c(task_id, host_name, sh_path, workload, minor_faults,
             major_faults, swaps, in_block, out_block, signals,
             voluntary_ctx, involuntary_ctx)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload)

  Log('details')
  print(details)

  # Columns of the three comparison tables below
  cols2 <- c('workload', 'host_name',
             'bash', 'dash', 'osh-native', 'osh-souffle', 'osh-static',
             'osh_bash_ratio', 'static_bash_ratio')

  # Elapsed time comparison
  elapsed <- details %>%
    select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
    spread(key = shell_label, value = elapsed_ms) %>%
    mutate(osh_bash_ratio = `osh-native` / bash) %>%
    mutate(static_bash_ratio = `osh-static` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2))

  Log('elapsed')
  print(elapsed)

  # Minor Page Faults Comparison
  page_faults <- details_io %>%
    select(c(host_name, shell_label, workload, minor_faults)) %>%
    spread(key = shell_label, value = minor_faults) %>%
    mutate(osh_bash_ratio = `osh-native` / bash) %>%
    mutate(static_bash_ratio = `osh-static` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2))

  Log('page_faults')
  print(page_faults)

  # Max RSS comparison
  max_rss <- details %>%
    select(c(host_name, shell_label, workload, max_rss_MB)) %>%
    spread(key = shell_label, value = max_rss_MB) %>%
    mutate(osh_bash_ratio = `osh-native` / bash) %>%
    mutate(static_bash_ratio = `osh-static` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2))

  Log('max rss')
  print(max_rss)

  # The key for joining with gc_stats is "gc-<task_id>"
  gc_details <- details %>%
    select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
    mutate(join_id = sprintf("gc-%d", task_id)) %>%
    select(-c(task_id))

  Log('GC details')
  print(gc_details)
  Log('')

  Log('GC stats')
  print(gc_stats)
  Log('')

  gc_stats <- gc_stats %>%
    left_join(gc_details, by = c('join_id', 'host_name')) %>%
    select(-c(join_id, roots_capacity, objs_capacity)) %>%
    # Do same transformations as GcReport()
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    select(-c(bytes_allocated)) %>%
    rename(num_gc_done = num_collections) %>%
    # Put these columns first
    relocate(workload, host_name,
             elapsed_ms, max_gc_millis, total_gc_millis,
             allocated_MB, max_rss_MB, num_allocated)

  Log('After GC stats')
  print(gc_stats)
  Log('')

  WriteSimpleProvenance(provenance, out_dir)

  # milliseconds don't need decimal digit
  precision <- ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
                                    `osh-native` = 0, `osh-souffle` = 0,
                                    `osh-static` = 0,
                                    osh_bash_ratio = 2,
                                    static_bash_ratio = 2))
  writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
  writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)

  precision2 <- ColumnPrecision(list(osh_bash_ratio = 2, static_bash_ratio = 2))
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)

  precision3 <- ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                                default = 0)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)

  writeTsv(details, file.path(out_dir, 'details'), precision3)
  writeTsv(details_io, file.path(out_dir, 'details_io'))

  Log('Wrote %s', out_dir)
}
672
VmBaselineReport <- function(in_dir, out_dir) {
  ### Read vm-baseline.tsv from in_dir and write the 'vm-baseline' table,
  ### with VmRSS and VmPeak in MB per shell and host, to out_dir.

  vm <- readTsv(file.path(in_dir, 'vm-baseline.tsv'))
  #print(vm)

  # Not using DistinctHosts() because field host_hash isn't collected
  num_hosts <- vm %>% distinct(host) %>% nrow()

  vm <- vm %>%
    rename(kib = metric_value) %>%
    mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
           megabytes = kib * 1024 / 1e6) %>%
    select(-c(shell_name, kib)) %>%
    # One column per metric name
    spread(key = c(metric_name), value = megabytes) %>%
    rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
    select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
    arrange(shell_label, shell_hash, host, VmPeak_MB)

  print(vm)

  writeTsv(vm, file.path(out_dir, 'vm-baseline'))
}
695
WriteOvmBuildDetails <- function(distinct_hosts, distinct_compilers, out_dir) {
  ### Write the 'hosts' and 'compilers' tables under out_dir.
  #
  # IDs have the form <name>-<hash> and always link to the benchmark-data
  # repo.

  host_ids <- paste(distinct_hosts$host_name, distinct_hosts$host_hash,
                    sep = '-')
  host_table <- tibble(
    host_label = distinct_hosts$host_label,
    host_id = host_ids,
    host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
  )
  print(host_table)

  compiler_ids <- paste(distinct_compilers$compiler_label,
                        distinct_compilers$compiler_hash, sep = '-')
  compiler_table <- tibble(
    compiler_label = distinct_compilers$compiler_label,
    compiler_id = compiler_ids,
    compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
  )
  print(compiler_table)

  writeTsv(host_table, file.path(out_dir, 'hosts'))
  writeTsv(compiler_table, file.path(out_dir, 'compilers'))
}
716
OvmBuildReport <- function(in_dir, out_dir) {
  ### Analyze ovm-build data in in_dir and write tables to out_dir.
  #
  # Reads times.tsv and native-sizes.tsv.  Stops if any task has a nonzero
  # status.  Writes hosts and compilers (via WriteOvmBuildDetails), plus
  # times and native-sizes with one column per host.

  times <- readTsv(file.path(in_dir, 'times.tsv'))
  native_sizes <- readTsv(file.path(in_dir, 'native-sizes.tsv'))
  #raw_data = readTsv(file.path(in_dir, 'raw-data.tsv'))

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some ovm-build tasks failed')
  }

  # Hosts are labeled by name; compilers by the basename of their path
  distinct_hosts <- times %>% distinct(host_name, host_hash)
  distinct_hosts$host_label <- distinct_hosts$host_name

  distinct_compilers <- times %>% distinct(compiler_path, compiler_hash)
  distinct_compilers$compiler_label <-
    basename(distinct_compilers$compiler_path)

  #print(distinct_hosts)
  #print(distinct_compilers)

  WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)

  times <- times %>%
    select(-c(status)) %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_compilers, by = c('compiler_path', 'compiler_hash')) %>%
    select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
    # The is_* columns only exist to order the rows; they're dropped below
    mutate(src_dir = basename(src_dir),
           host_label = paste("host ", host_label),
           is_conf = str_detect(action, 'configure'),
           is_ovm = str_detect(action, 'oil.ovm'),
           is_dbg = str_detect(action, 'dbg')) %>%
    select(host_label, src_dir, compiler_label, action, is_conf, is_ovm,
           is_dbg, elapsed_secs) %>%
    spread(key = c(host_label), value = elapsed_secs) %>%
    arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
    select(-c(is_conf, is_ovm, is_dbg))

  #print(times)

  # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
  native_sizes <- native_sizes %>%
    select(c(host_label, path, num_bytes)) %>%
    mutate(host_label = paste("host ", host_label),
           binary = basename(path),
           compiler = basename(dirname(path))) %>%
    select(-c(path)) %>%
    spread(key = c(host_label), value = num_bytes) %>%
    arrange(compiler, binary)

  # NOTE: These don't have the host and compiler.
  writeTsv(times, file.path(out_dir, 'times'))
  writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))

  # TODO: I want a size report too
  #writeCsv(sizes, file.path(out_dir, 'sizes'))
}
778
# Checks that t has exactly num_expected distinct values of stdout_md5sum.
# On a mismatch, prints the identifying columns of t and stops.
unique_stdout_md5sum <- function(t, num_expected) {
  num_unique <- n_distinct(t$stdout_md5sum)
  if (num_unique == num_expected) {
    return(invisible(NULL))
  }

  mismatch_rows <- select(t, c(host_name, task_name, arg1, arg2, runtime_name,
                               stdout_md5sum))
  print(mismatch_rows)
  stop(sprintf('Expected %d unique md5sums, got %d', num_expected, num_unique))
}
786
ComputeReport = function(in_dir, out_dir) {
  ### Analyze compute benchmark data in in_dir and write tables to out_dir.
  #
  # Reads times.tsv.  Stops if any task has a nonzero status, or if a task's
  # runtimes don't agree on stdout (compared by md5sum).  Writes details,
  # stdout_files, one table per task, and hosts/shells (via WriteProvenance).

  # TSV file, not CSV
  times = read.table(file.path(in_dir, 'times.tsv'), header=T)
  print(times)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some compute tasks failed')
  }

  #
  # Check correctness
  #

  times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'for_loop') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'control_flow') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
  # 3 different inputs
  times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)

  times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)

  # TODO:
  # - oils_cpp doesn't implement unicode LANG=C
  # - bash behaves differently on your desktop vs. in the container
  #   - might need layer-locales in the image?

  #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
  # Ditto here
  #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  # Runtimes are called shells, as a hack for code reuse
  times %>%
    mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
    select(c(host_name, host_hash, shell_name, shell_hash)) ->
    tmp

  distinct_hosts = DistinctHosts(tmp)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells = DistinctShells(tmp)
  Log('')
  Log('Distinct runtimes')
  print(distinct_shells)

  num_hosts = nrow(distinct_hosts)

  # runtime_hash must still be a column when ShellLabels() is called, so it's
  # dropped AFTER the mutate().  Dropping it first only worked as long as
  # ShellLabels() never evaluated its shell_hash argument, i.e. as long as no
  # runtime_name was exactly 'osh'.
  times %>%
    select(-c(status, stdout_md5sum, stdout_filename, host_hash)) %>%
    mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(runtime_name, runtime_hash, elapsed_secs, user_secs, sys_secs,
              max_rss_KiB)) %>%
    arrange(host_name, task_name, arg1, arg2, user_ms) ->
    details

  times %>%
    mutate(
      runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
      stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
    select(c(host_name, task_name, arg1, arg2, runtime_label,
             stdout_md5sum, stdout_md5sum_HREF)) ->
    stdout_files

  details %>% filter(task_name == 'hello') %>% select(-c(task_name)) -> hello
  details %>% filter(task_name == 'fib') %>% select(-c(task_name)) -> fib
  details %>% filter(task_name == 'for_loop') %>% select(-c(task_name)) -> for_loop
  details %>% filter(task_name == 'control_flow') %>% select(-c(task_name)) -> control_flow
  details %>% filter(task_name == 'word_freq') %>% select(-c(task_name)) -> word_freq
  # There's no arg2
  details %>% filter(task_name == 'parse_help') %>% select(-c(task_name, arg2)) -> parse_help

  details %>% filter(task_name == 'bubble_sort') %>% select(-c(task_name)) -> bubble_sort
  details %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% select(-c(task_name)) -> palindrome

  precision = ColumnPrecision(list(max_rss_MB = 1), default = 0)
  writeTsv(details, file.path(out_dir, 'details'), precision)

  writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)

  writeTsv(hello, file.path(out_dir, 'hello'), precision)
  writeTsv(fib, file.path(out_dir, 'fib'), precision)
  writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
  writeTsv(for_loop, file.path(out_dir, 'for_loop'), precision)
  writeTsv(control_flow, file.path(out_dir, 'control_flow'), precision)
  writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)

  writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
  writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
}
890
# Writes the rows of 'times' whose task column equals task_name, minus the
# task column itself, to a table named after the task under out_dir.
WriteOneTask <- function(times, out_dir, task_name, precision) {
  task_rows <- times %>%
    filter(task == task_name) %>%
    select(-c(task))

  out_path <- file.path(out_dir, task_name)
  writeTsv(task_rows, out_path, precision)
}
898
# Factor levels used to order rows by sh_path in the GC reports.
SHELL_ORDER <- c(
  # Other shells
  'dash', 'bash', 'zsh',
  # osh build variants
  '_bin/cxx-opt+bumpleak/osh',
  '_bin/cxx-opt+bumproot/osh',
  '_bin/cxx-opt+bumpsmall/osh',
  '_bin/cxx-opt/osh',
  '_bin/cxx-opt/mycpp-souffle/osh',
  '_bin/cxx-opt+nopool/osh'
)
908
GcReport <- function(in_dir, out_dir) {
  ### Analyze GC benchmark data in in_dir and write tables to out_dir.
  #
  # Reads raw/times.tsv and stage1/gc_stats.tsv.  Stops if any task has a
  # nonzero status.  Writes times, gc_stats, and one table per task listed
  # at the bottom.

  times <- read.table(file.path(in_dir, 'raw/times.tsv'), header = TRUE)
  gc_stats <- read.table(file.path(in_dir, 'stage1/gc_stats.tsv'),
                         header = TRUE)

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  # Change units and order columns
  times <- times %>%
    arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           shell_label = ShellLabelFromPath(sh_path)) %>%
    select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB,
             shell_label, shell_runtime_opts))

  # Join and order columns
  gc_stats <- gc_stats %>%
    left_join(times, by = c('join_id')) %>%
    arrange(desc(task)) %>%
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    # try to make the table skinnier
    rename(num_gc_done = num_collections) %>%
    select(task, elapsed_ms, max_gc_millis, total_gc_millis,
           allocated_MB, max_rss_MB, num_allocated,
           num_gc_points, num_gc_done, gc_threshold, num_growths,
           max_survived, shell_label)

  # join_id was only needed for the join above
  times <- times %>% select(-c(join_id))

  precision <- ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                               default = 0)

  writeTsv(times, file.path(out_dir, 'times'), precision)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)

  tasks <- c('parse.configure-coreutils',
             'parse.configure-cpython',
             'parse.abuild',
             'ex.compute-fib',
             'ex.bashcomp-parse-help',
             'ex.abuild-print-help')
  # Write out separate rows
  for (task in tasks) {
    WriteOneTask(times, out_dir, task, precision)
  }
}
964
GcCachegrindReport <- function(in_dir, out_dir) {
  ### Join cachegrind counts with task metadata, and write one table of
  ### instruction counts (in millions) per task to out_dir.
  #
  # Reads raw/times.tsv and stage1/cachegrind.tsv.  Stops if any task has a
  # nonzero status.

  times <- readTsv(file.path(in_dir, 'raw/times.tsv'))
  counts <- readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))

  failed <- times %>% filter(status != 0)
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  print(times)
  print(counts)

  counts <- counts %>%
    left_join(times, by = c('join_id')) %>%
    mutate(million_irefs = irefs / 1e6) %>%
    select(c(million_irefs, task, sh_path, shell_runtime_opts)) %>%
    arrange(factor(sh_path, levels = SHELL_ORDER))

  precision <- NULL
  tasks <- c('parse.abuild', 'ex.compute-fib')
  for (task in tasks) {
    WriteOneTask(counts, out_dir, task, precision)
  }
}
990
# Compare the C++ and Python implementations of the mycpp examples.
#
# Args:
#   in_dir: directory containing benchmark-table.tsv
#   out_dir: directory that receives the user_time, sys_time, max_rss and
#     details tables
#
# Stops with an error if any row of the input has a nonzero status.
MyCppReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
  print(times)

  failed = times %>% filter(status != 0)
  if (nrow(failed) > 0) {
    print(failed)
    stop('Some mycpp tasks failed')
  }

  # Drop the columns we don't report, add links, and convert units
  # (seconds -> ms, KiB -> MB)
  details = times %>%
    select(-c(status, elapsed_secs, bin, task_out)) %>%
    mutate(example_name_HREF = mycppUrl(example_name),
           gen = 'gen',
           gen_HREF = genUrl(example_name),
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(user_secs, sys_secs, max_rss_KiB))

  # One table per metric: one column per impl, plus the C++ / Python ratio,
  # sorted by that ratio
  user_time = details %>%
    select(-c(sys_ms, max_rss_MB)) %>%
    spread(key = impl, value = user_ms) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`)

  sys_time = details %>%
    select(-c(user_ms, max_rss_MB)) %>%
    spread(key = impl, value = sys_ms) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`)

  max_rss = details %>%
    select(-c(user_ms, sys_ms)) %>%
    spread(key = impl, value = max_rss_MB) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`)

  # Sometimes it speeds up by more than 10x
  time_precision = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
  writeTsv(user_time, file.path(out_dir, 'user_time'), time_precision)
  writeTsv(sys_time, file.path(out_dir, 'sys_time'), time_precision)

  rss_precision = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), rss_precision)

  writeTsv(details, file.path(out_dir, 'details'))
}
1040
# Print an allocation report for one uftrace task and record its summary.
#
# Args:
#   env: container indexed by task name; env[[task_name]] must provide the
#     data frames untyped, typed, strings, slabs and reserve
#   task_name: name of the task to report on
#   summaries: receives summaries$stats[[task_name]] (a 1-row tibble of
#     formatted totals and percentages) and
#     summaries$most_common_types[[task_name]] (allocation counts by
#     func_name).  UftraceReport() passes an environment here, so the
#     assignments are visible to it.
#
# The report itself is written with Log() and print().
UftraceTaskReport = function(env, task_name, summaries) {
  # Need this again after redirect
  MaybeDisableColor(stdout())

  task_env = env[[task_name]]

  untyped = task_env$untyped
  typed = task_env$typed
  strings = task_env$strings
  slabs = task_env$slabs
  reserve = task_env$reserve

  string_overhead = 17  # GC header (8) + len (4) + hash value (4) + NUL (1)
  strings %>% mutate(obj_len = str_len + string_overhead) -> strings

  # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
  # big/small
  #
  # And then zoom in on distributions as well

  num_allocs = nrow(untyped)
  total_bytes = sum(untyped$obj_len)

  # Histogram of allocation sizes; group_by() leaves it sorted by obj_len, so
  # the cumsum() below counts allocations of that size or smaller
  untyped %>% group_by(obj_len) %>% count() %>% ungroup() -> untyped_hist
  #print(untyped_hist)

  untyped_hist %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_allocs) ->
    alloc_sizes

  a24 = untyped_hist %>% filter(obj_len <= 24)
  a48 = untyped_hist %>% filter(obj_len <= 48)
  a96 = untyped_hist %>% filter(obj_len <= 96)

  allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
  allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
  allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs

  Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)

  options(tibble.print_min=25)

  Log('')
  Log('All allocations')
  print(alloc_sizes %>% head(22))
  print(alloc_sizes %>% tail(5))

  Log('')
  Log('Common Sizes')
  print(untyped_hist %>% arrange(desc(n)) %>% head(8))

  Log('')
  Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
  Log('')

  Log('Typed allocations')

  num_typed = nrow(typed)

  typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
    mutate(percent = n * 100.0 / num_typed) %>%
    arrange(desc(n)) -> most_common_types

  print(most_common_types %>% head(20))
  print(most_common_types %>% tail(5))

  lists = typed %>% filter(str_starts(func_name, ('List<')))
  #print(lists)

  num_lists = nrow(lists)
  total_list_bytes = num_lists * 24  # sizeof List<T> head is hard-coded

  Log('')
  Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
  Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
  Log('')

  #
  # Strings
  #

  num_strings = nrow(strings)
  total_string_bytes = sum(strings$obj_len)

  strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_strings) ->
    string_lengths

  # Cumulative percentages, computed straight from the data rather than by
  # looking up the histogram row with str_len == 6 / 14: that row doesn't
  # exist when no string has exactly that length.
  strs_6_bytes_or_less = sum(strings$str_len <= 6) * 100.0 / num_strings
  strs_14_bytes_or_less = sum(strings$str_len <= 14) * 100.0 / num_strings

  # Parse workload
  # 62% of strings <= 6 bytes
  # 84% of strings <= 14 bytes

  Log('Str - NewStr() and OverAllocatedStr()')
  print(string_lengths %>% head(16))
  print(string_lengths %>% tail(5))
  Log('')

  Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
      commas(sum(strings$str_len)), commas(total_string_bytes))
  Log('')
  Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
  Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
  Log('')

  #
  # Slabs
  #

  Log('NewSlab()')

  num_slabs = nrow(slabs)
  slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_slabs) ->
    slab_lengths

  slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
    arrange(desc(n)) -> slab_types

  Log(' Lengths')
  print(slab_lengths %>% head())
  print(slab_lengths %>% tail(5))
  Log('')

  Log(' Slab Types')
  print(slab_types %>% head())
  print(slab_types %>% tail(5))
  Log('')

  total_slab_items = sum(slabs$slab_len)

  Log('%s slabs, total items = %s', commas(num_slabs),
      commas(total_slab_items))
  Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
  Log('')

  #
  # reserve() calls
  #

  # There should be strictly more List::reserve() calls than NewSlab

  Log('::reserve(int n)')
  Log('')

  num_reserve = nrow(reserve)
  reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_reserve) ->
    reserve_args

  Log(' Num Items')
  print(reserve_args %>% head(15))
  print(reserve_args %>% tail(5))
  Log('')

  Log('%s reserve() calls, total items = %s', commas(num_reserve),
      commas(sum(reserve$num_items)))
  Log('')

  # Accounting for all allocations!
  Log('Untyped: %s', commas(num_allocs))
  Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
  Log('')

  num_other_typed = num_typed - num_lists

  # Summary table: one row per task, every value already formatted as a string
  stats = tibble(task = task_name,
                 total_bytes_ = commas(total_bytes),
                 num_allocs_ = commas(num_allocs),
                 sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
                 num_reserve_calls = commas(num_reserve),

                 percent_list_allocs = Percent(num_lists, num_allocs),
                 percent_slab_allocs = Percent(num_slabs, num_allocs),
                 percent_string_allocs = Percent(num_strings, num_allocs),
                 percent_other_typed_allocs = Percent(num_other_typed, num_allocs),

                 percent_list_bytes = Percent(total_list_bytes, total_bytes),
                 percent_string_bytes = Percent(total_string_bytes, total_bytes),

                 allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
                 allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
                 allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),

                 strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
                 strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less)
                 )
  summaries$stats[[task_name]] = stats

  summaries$most_common_types[[task_name]] = most_common_types
}
1239
# Load the uftrace TSV files for every task found under in_dir.
#
# Args:
#   in_dir: directory with one subdirectory per task
#   env: environment that receives one entry per task; each entry is a new
#     environment holding the data frames untyped, typed, strings, slabs and
#     reserve
#
# A summary() of each loaded table is printed as it goes.
LoadUftraceTsv = function(in_dir, env) {
  # Field in the per-task environment -> file it is read from (TSV, not CSV)
  tsv_files = c(untyped = 'all-untyped.tsv',
                typed = 'typed.tsv',
                strings = 'strings.tsv',
                slabs = 'slabs.tsv',
                reserve = 'reserve.tsv')
  fields = names(tsv_files)

  for (task in list.files(in_dir)) {
    Log('Loading data for task %s', task)
    base_dir = file.path(in_dir, task)

    task_env = new.env()
    env[[task]] = task_env

    # Read every table first, then print the summaries
    for (field in fields) {
      task_env[[field]] = readTsv(file.path(base_dir, tsv_files[[field]]))
    }

    # median string length is 4, mean is 9.5!
    for (field in fields) {
      Log(toupper(field))  # UNTYPED, TYPED, STRINGS, SLABS, RESERVE
      print(summary(task_env[[field]]))
      Log('')
    }
  }
}
1277
# Format n as a percentage of total, with one decimal place, e.g. '25.0%'.
Percent = function(n, total) {
  pct = 100.0 * n / total
  sprintf('%.1f%%', pct)
}
1281
# Print a data frame in "long" form: one output line per column of d, with
# the column name followed by that column's value in each row.
#
# Args:
#   d: data frame to print; prints nothing if it has no columns
PrettyPrintLong = function(d) {
  tr = t(d)  # transpose, so each column of d becomes a row

  row_names = rownames(tr)

  # A blank line is printed after these rows, to group related stats
  group_ends = c('num_reserve_calls',
                 'percent_string_bytes',
                 'percent_other_typed_allocs',
                 'allocs_96_bytes_or_less')

  # seq_len() rather than 1:nrow(tr), which would iterate over c(1, 0) and
  # fail on an empty table
  for (i in seq_len(nrow(tr))) {
    row_name = row_names[i]
    cat(sprintf('%26s', row_name))  # calculated min width manually
    cat(sprintf('%20s', tr[i,]))
    cat('\n')

    # Extra spacing
    if (row_name %in% group_ends) {
      cat('\n')
    }
  }
}
1302
1303
# Write one text report per task, plus a summary across all tasks.
#
# Args:
#   env: environment with one entry per task, as filled in by LoadUftraceTsv()
#   out_dir: directory that receives <task_name>.txt for each task, and
#     summary.txt
#
# Returns:
#   list(stats = ...), where stats has one row per task
UftraceReport = function(env, out_dir) {
  # summaries$stats is a list of 1-row data frames, keyed by task name
  # summaries$most_common_types is a list of data frames, keyed by task name
  summaries = new.env()

  for (task_name in names(env)) {
    report_out = file.path(out_dir, paste0(task_name, '.txt'))

    Log('Making report for task %s -> %s', task_name, report_out)

    sink(file = report_out)
    # Always undo the redirect, even if the report fails.  Otherwise later
    # output (e.g. in an interactive session) would keep going to the file.
    tryCatch(UftraceTaskReport(env, task_name, summaries),
             finally = sink())
  }
  Log('')

  # Concatenate all the data frames added to summaries
  stats = bind_rows(as.list(summaries$stats))

  sink(file = file.path(out_dir, 'summary.txt'))
  tryCatch({
    #print(stats)
    #Log('')

    PrettyPrintLong(stats)
    Log('')

    mct = summaries$most_common_types
    for (task_name in names(mct)) {
      Log('Common types in workload %s', task_name)
      Log('')

      print(mct[[task_name]] %>% head(5))
      Log('')
    }
  }, finally = sink())

  # For the REPL
  return(list(stats = stats))
}
1343
# Entry point: dispatch to the report named by the first argument.
#
# Args:
#   argv: command line arguments -- action, input directory, output directory
#
# Quits with status 1 if the action is not recognized.
main = function(argv) {
  action = argv[[1]]
  in_dir = argv[[2]]
  out_dir = argv[[3]]

  switch(action,
    'osh-parser' = ParserReport(in_dir, out_dir),
    'osh-runtime' = RuntimeReport(in_dir, out_dir),
    'vm-baseline' = VmBaselineReport(in_dir, out_dir),
    'ovm-build' = OvmBuildReport(in_dir, out_dir),
    'compute' = ComputeReport(in_dir, out_dir),
    'gc' = GcReport(in_dir, out_dir),
    'gc-cachegrind' = GcCachegrindReport(in_dir, out_dir),
    'mycpp' = MyCppReport(in_dir, out_dir),
    'uftrace' = {
      # This one loads its data into an environment before reporting on it
      d = new.env()
      LoadUftraceTsv(in_dir, d)
      UftraceReport(d, out_dir)
    },
    {
      # No name matched
      Log("Invalid action '%s'", action)
      quit(status = 1)
    }
  )

  Log('PID %d done', Sys.getpid())
}
1384
# Only run main() when there are no active call frames, i.e. when this file is
# executed as a script rather than evaluated from inside another call.
if (length(sys.frames()) == 0) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))

  main(commandArgs(trailingOnly = TRUE))
}