OILS / benchmarks / report.R View on Github | oils.pub

1390 lines, 964 significant
1#!/usr/bin/env Rscript
2#
3# benchmarks/report.R -- Analyze data collected by shell scripts.
4#
5# Usage:
6# benchmarks/report.R OUT_DIR [TIMES_CSV...]
7
8# Suppress warnings about functions masked from 'package:stats' and 'package:base'
9# filter, lag
10# intersect, setdiff, setequal, union
11library(dplyr, warn.conflicts = FALSE)
12library(tidyr) # spread()
13library(stringr)
14
15source('benchmarks/common.R')
16
17options(stringsAsFactors = F)
18
19# For pretty printing
20commas = function(x) {
21 format(x, big.mark=',')
22}
23
24sourceUrl = function(path) {
25 sprintf('https://github.com/oilshell/oil/blob/master/%s', path)
26}
27
28# Takes a filename, not a path.
29sourceUrl2 = function(filename) {
30 sprintf(
31 'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s',
32 filename)
33}
34
35mycppUrl = function(name) {
36 sprintf('https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py', name)
37}
38
39genUrl = function(name) {
40 sprintf('../../_gen/mycpp/examples/%s.mycpp.cc', name)
41}
42
43
44# TODO: Set up cgit because Github links are slow.
45benchmarkDataLink = function(subdir, name, suffix) {
46 #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
47 sprintf('https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s',
48 subdir, name, suffix)
49}
50
51provenanceLink = function(subdir, name, suffix) {
52 sprintf('../%s/%s%s', subdir, name, suffix)
53}
54
55
56GetOshLabel = function(shell_hash, prov_dir) {
57 ### Given a string, return another string.
58
59 path = sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)
60
61 if (file.exists(path)) {
62 Log('Reading %s', path)
63 lines = readLines(path)
64 if (length(grep('_bin/osh', lines)) > 0) {
65 label = 'osh-ovm'
66 } else if (length(grep('bin/osh', lines)) > 0) {
67 label = 'osh-cpython'
68 } else if (length(grep('_bin/.*/mycpp-souffle/osh', lines)) > 0) {
69 label = 'osh-souffle'
70 } else if (length(grep('_bin/.*/osh', lines)) > 0) {
71 label = 'osh-native'
72 } else {
73 stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
74 }
75 } else {
76 stop(sprintf("%s doesn't exist", path))
77 }
78 return(label)
79}
80
81osh_opt_suffix1 = '_bin/cxx-opt/osh'
82osh_opt_suffix2 = '_bin/cxx-opt-sh/osh'
83
84osh_souffle_suffix1 = '_bin/cxx-opt/mycpp-souffle/osh'
85osh_souffle_suffix2 = '_bin/cxx-opt-sh/mycpp-souffle/osh'
86
87ysh_souffle_suffix1 = '_bin/cxx-opt/mycpp-souffle/ysh'
88ysh_souffle_suffix2 = '_bin/cxx-opt-sh/mycpp-souffle/ysh'
89
90ysh_opt_suffix1 = '_bin/cxx-opt/ysh'
91ysh_opt_suffix2 = '_bin/cxx-opt-sh/ysh'
92
93ShellLabels = function(shell_name, shell_hash, num_hosts) {
94 ### Given 2 vectors, return a vector of readable labels.
95
96 # TODO: Clean up callers. Some metrics all this function with a
97 # shell/runtime BASENAME, and others a PATH
98 # - e.g. ComputeReport calls this with runtime_name which is actually a PATH
99
100 #Log('name %s', shell_name)
101 #Log('hash %s', shell_hash)
102
103 if (num_hosts == 1) {
104 prov_dir = '_tmp'
105 } else {
106 prov_dir = '../benchmark-data/'
107 }
108
109 labels = c()
110 for (i in 1:length(shell_name)) {
111 sh = shell_name[i]
112 if (sh == 'osh') {
113 label = GetOshLabel(shell_hash[i], prov_dir)
114
115 } else if (endsWith(sh, 'osh-static')) {
116 label = 'osh-static'
117 } else if (endsWith(sh, 'ysh-static')) {
118 label = 'ysh-static'
119
120 } else if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
121 label = 'opt/osh'
122 } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
123 label = 'opt/ysh'
124
125 } else if (endsWith(sh, osh_souffle_suffix1) || endsWith(sh, osh_souffle_suffix2)) {
126 label = 'opt/osh-souffle'
127 } else if (endsWith(sh, ysh_souffle_suffix1) || endsWith(sh, ysh_souffle_suffix2)) {
128 label = 'opt/ysh-souffle'
129
130 } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
131 label = 'bumpleak/osh'
132
133 } else {
134 label = sh
135 }
136
137 Log('[%s] [%s]', shell_name[i], label)
138 labels = c(labels, label)
139 }
140
141 return(labels)
142}
143
144# Simple version of the above, used by benchmarks/{gc,osh-runtime}
145ShellLabelFromPath = function(sh_path) {
146 labels = c()
147 for (i in 1:length(sh_path)) {
148 sh = sh_path[i]
149
150 if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
151 # the opt binary is called osh-native - the osh-runtime report relies on this
152 label = 'osh-native'
153
154 } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
155 label = 'opt/ysh'
156
157 } else if (endsWith(sh, 'osh-static')) {
158 label = 'osh-static'
159 } else if (endsWith(sh, 'ysh-static')) {
160 label = 'ysh-static'
161
162 } else if (endsWith(sh, osh_souffle_suffix1) || endsWith(sh, osh_souffle_suffix2)) {
163 label = 'osh-souffle'
164
165 } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
166 label = 'bumpleak/osh'
167
168 } else if (endsWith(sh, '_bin/osh')) { # the app bundle
169 label = 'osh-ovm'
170
171 } else if (endsWith(sh, 'bin/osh')) {
172 label = 'osh-cpython'
173
174 } else {
175 label = sh
176 }
177 labels = c(labels, label)
178 }
179 return(labels)
180}
181
182DistinctHosts = function(t) {
183 t %>% distinct(host_name, host_hash) -> distinct_hosts
184 # The label is just the name
185 distinct_hosts$host_label = distinct_hosts$host_name
186 return(distinct_hosts)
187}
188
189DistinctShells = function(t, num_hosts = -1) {
190 t %>% distinct(shell_name, shell_hash) -> distinct_shells
191
192 Log('')
193 Log('Labeling shells')
194
195 # Calculate it if not passed
196 if (num_hosts == -1) {
197 num_hosts = nrow(DistinctHosts(t))
198 }
199
200 distinct_shells$shell_label = ShellLabels(distinct_shells$shell_name,
201 distinct_shells$shell_hash,
202 num_hosts)
203 return(distinct_shells)
204}
205
206ParserReport = function(in_dir, out_dir) {
207 times = read.csv(file.path(in_dir, 'times.csv'))
208 lines = read.csv(file.path(in_dir, 'lines.csv'))
209 raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))
210
211 cachegrind = readTsv(file.path(in_dir, 'cachegrind.tsv'))
212
213 # For joining by filename
214 lines_by_filename = tibble(
215 num_lines = lines$num_lines,
216 filename = basename(lines$path)
217 )
218
219 # Remove failures
220 times %>% filter(status == 0) %>% select(-c(status)) -> times
221 cachegrind %>% filter(status == 0) %>% select(-c(status)) -> cachegrind
222
223 # Add the number of lines, joining on path, and compute lines/ms
224 times %>%
225 left_join(lines, by = c('path')) %>%
226 mutate(filename = basename(path), filename_HREF = sourceUrl(path),
227 max_rss_MB = max_rss_KiB * 1024 / 1e6,
228 elapsed_ms = elapsed_secs * 1000,
229 user_ms = user_secs * 1000,
230 sys_ms = sys_secs * 1000,
231 lines_per_ms = num_lines / elapsed_ms) %>%
232 select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs)) ->
233 joined_times
234
235 #print(head(times))
236 #print(head(lines))
237 #print(head(vm))
238 #print(head(joined_times))
239
240 print(summary(joined_times))
241
242 #
243 # Find distinct shells and hosts, and label them for readability.
244 #
245
246 distinct_hosts = DistinctHosts(joined_times)
247 Log('')
248 Log('Distinct hosts')
249 print(distinct_hosts)
250
251 distinct_shells = DistinctShells(joined_times)
252 Log('')
253 Log('Distinct shells')
254 print(distinct_shells)
255
256 # Replace name/hash combinations with labels.
257 joined_times %>%
258 left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
259 left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
260 select(-c(host_name, host_hash, shell_name, shell_hash)) ->
261 joined_times
262
263 # Like 'times', but do shell_label as one step
264 # Hack: we know benchmarks/auto.sh runs this on one machine
265 distinct_shells_2 = DistinctShells(cachegrind, num_hosts = nrow(distinct_hosts))
266 cachegrind %>%
267 left_join(lines, by = c('path')) %>%
268 select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
269 left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
270 select(-c(shell_name, shell_hash)) %>%
271 mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
272 select(-c(path)) ->
273 joined_cachegrind
274
275 Log('summary(joined_times):')
276 print(summary(joined_times))
277 Log('head(joined_times):')
278 print(head(joined_times))
279
280 # Summarize rates by platform/shell
281 joined_times %>%
282 mutate(host_label = paste("host", host_label)) %>%
283 group_by(host_label, shell_label) %>%
284 summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
285 mutate(lines_per_ms = total_lines / total_ms) %>%
286 select(-c(total_ms)) %>%
287 spread(key = host_label, value = lines_per_ms) ->
288 times_summary
289
290 # Sort by parsing rate on machine 1
291 if ("host hoover" %in% colnames(times_summary)) {
292 times_summary %>% arrange(desc(`host hoover`)) -> times_summary
293 } else {
294 times_summary %>% arrange(desc(`host no-host`)) -> times_summary
295 }
296
297 Log('times_summary:')
298 print(times_summary)
299
300 # Summarize cachegrind by platform/shell
301 # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
302 joined_cachegrind %>%
303 group_by(shell_label) %>%
304 summarize(total_lines = sum(num_lines), total_irefs = sum(as.numeric(irefs))) %>%
305 mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
306 select(-c(total_irefs)) ->
307 cachegrind_summary
308
309 if ("no-host" %in% distinct_hosts$host_label) {
310
311 # We don't have all the shells
312 elapsed = NULL
313 rate = NULL
314 max_rss = NULL
315 instructions = NULL
316
317 joined_times %>%
318 select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
319 num_lines, filename, filename_HREF)) %>%
320 arrange(filename, elapsed_ms) ->
321 times_flat
322
323 joined_cachegrind %>%
324 select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
325 arrange(filename, irefs) ->
326 cachegrind_flat
327
328 } else {
329
330 times_flat = NULL
331 cachegrind_flat = NULL
332
333 # Hack for release. TODO: unify with SoilAdd commentMore actions
334 if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
335 souffle_col = c('osh-souffle')
336 } else {
337 souffle_col = c()
338 }
339
340 cols1 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
341 'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
342 'osh_to_bash_ratio', 'num_lines', 'filename', 'filename_HREF')
343
344 # Elapsed seconds for each shell by platform and file
345 joined_times %>%
346 select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
347 spread(key = shell_label, value = elapsed_ms) %>%
348 arrange(host_label, num_lines) %>%
349 mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
350 select(all_of(cols1)) ->
351 elapsed
352
353 Log('\n')
354 Log('ELAPSED')
355 print(elapsed)
356
357 cols2 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
358 'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
359 'num_lines', 'filename', 'filename_HREF')
360 # Rates by file and shell
361 joined_times %>%
362 select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
363 spread(key = shell_label, value = lines_per_ms) %>%
364 arrange(host_label, num_lines) %>%
365 select(all_of(cols2)) ->
366 rate
367
368 Log('\n')
369 Log('RATE')
370 print(rate)
371
372 # Memory usage by file
373 joined_times %>%
374 select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
375 spread(key = shell_label, value = max_rss_MB) %>%
376 arrange(host_label, num_lines) %>%
377 select(all_of(cols2)) ->
378 max_rss
379
380 Log('\n')
381 Log('MAX RSS')
382 print(max_rss)
383
384 Log('\n')
385 Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
386 print(joined_cachegrind)
387 #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))
388
389 cols3 = c('bash', 'dash', 'mksh', 'osh-native', souffle_col,
390 'num_lines', 'filename', 'filename_HREF')
391
392 # Cachegrind instructions by file
393 joined_cachegrind %>%
394 mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
395 select(-c(irefs)) %>%
396 spread(key = shell_label, value = thousand_irefs_per_line) %>%
397 arrange(num_lines) %>%
398 select(all_of(cols3)) ->
399 instructions
400
401 Log('\n')
402 Log('instructions has %d rows', nrow(instructions))
403 print(instructions)
404 }
405
406 WriteProvenance(distinct_hosts, distinct_shells, out_dir)
407
408 raw_data_table = tibble(
409 filename = basename(as.character(raw_data$path)),
410 filename_HREF = benchmarkDataLink('osh-parser', filename, '')
411 )
412 #print(raw_data_table)
413
414 writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))
415
416 precision = SamePrecision(0) # lines per ms
417 writeCsv(times_summary, file.path(out_dir, 'summary'), precision)
418
419 precision = ColumnPrecision(list(), default = 1)
420 writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'), precision)
421
422 if (!is.null(times_flat)) {
423 precision = SamePrecision(0)
424 writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
425 }
426
427 if (!is.null(cachegrind_flat)) {
428 precision = SamePrecision(0)
429 writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
430 }
431
432 if (!is.null(elapsed)) { # equivalent to no-host
433 # Round to nearest millisecond, but the ratio has a decimal point.
434 precision = ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
435 writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)
436
437 precision = SamePrecision(0)
438 writeCsv(rate, file.path(out_dir, 'rate'), precision)
439
440 writeCsv(max_rss, file.path(out_dir, 'max_rss'))
441
442 precision = SamePrecision(1)
443 writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
444 }
445
446 Log('Wrote %s', out_dir)
447}
448
449WriteProvenance = function(distinct_hosts, distinct_shells, out_dir, tsv = F) {
450
451 num_hosts = nrow(distinct_hosts)
452 if (num_hosts == 1) {
453 linkify = provenanceLink
454 } else {
455 linkify = benchmarkDataLink
456 }
457
458 Log('distinct_hosts')
459 print(distinct_hosts)
460 Log('')
461
462 Log('distinct_shells')
463 print(distinct_shells)
464 Log('')
465
466 # Should be:
467 # host_id_url
468 # And then csv_to_html will be smart enough? It should take --url flag?
469 host_table = tibble(
470 host_label = distinct_hosts$host_label,
471 host_id = paste(distinct_hosts$host_name,
472 distinct_hosts$host_hash, sep='-'),
473 host_id_HREF = linkify('host-id', host_id, '/')
474 )
475 Log('host_table')
476 print(host_table)
477 Log('')
478
479 shell_table = tibble(
480 shell_label = distinct_shells$shell_label,
481 shell_id = paste(distinct_shells$shell_name,
482 distinct_shells$shell_hash, sep='-'),
483 shell_id_HREF = linkify('shell-id', shell_id, '/')
484 )
485
486 Log('shell_table')
487 print(shell_table)
488 Log('')
489
490 if (tsv) {
491 writeTsv(host_table, file.path(out_dir, 'hosts'))
492 writeTsv(shell_table, file.path(out_dir, 'shells'))
493 } else {
494 writeCsv(host_table, file.path(out_dir, 'hosts'))
495 writeCsv(shell_table, file.path(out_dir, 'shells'))
496 }
497}
498
499WriteSimpleProvenance = function(provenance, out_dir) {
500 Log('provenance')
501 print(provenance)
502 Log('')
503
504 # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
505 # benchmarks/id.sh publish-shell-id uses
506 provenance %>%
507 mutate(shell_name = basename(sh_path)) %>%
508 distinct(shell_label, shell_name, shell_hash) ->
509 distinct_shells
510
511 Log('distinct_shells')
512 print(distinct_shells)
513 Log('')
514
515 provenance %>% distinct(host_label, host_name, host_hash) -> distinct_hosts
516
517 WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
518}
519
520RuntimeReport = function(in_dir, out_dir) {
521 times = readTsv(file.path(in_dir, 'times.tsv'))
522
523 gc_stats = readTsv(file.path(in_dir, 'gc_stats.tsv'))
524 provenance = readTsv(file.path(in_dir, 'provenance.tsv'))
525
526 times %>% filter(status != 0) -> failed
527 if (nrow(failed) != 0) {
528 print(failed)
529 stop('Some osh-runtime tasks failed')
530 }
531
532 # Joins:
533 # times <= sh_path => provenance
534 # times <= join_id, host_name => gc_stats
535
536 # TODO: provenance may have rows from 2 machines. Could validate them and
537 # deduplicate.
538
539 # It should have (host_label, host_name, host_hash)
540 # (shell_label, sh_path, shell_hash)
541 provenance %>%
542 mutate(host_label = host_name, shell_label = ShellLabelFromPath(sh_path)) ->
543 provenance
544
545 provenance %>% distinct(sh_path, shell_label) -> label_lookup
546
547 Log('label_lookup')
548 print(label_lookup)
549
550 # Join with provenance for host label and shell label
551 times %>%
552 select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
553 host_name, sh_path, workload)) %>%
554 mutate(elapsed_ms = elapsed_secs * 1000,
555 user_ms = user_secs * 1000,
556 sys_ms = sys_secs * 1000,
557 max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
558 select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
559 left_join(label_lookup, by = c('sh_path')) %>%
560 select(-c(sh_path)) %>%
561 # we want to compare workloads on adjacent rows
562 arrange(workload) ->
563 details
564
565 times %>%
566 select(c(task_id, host_name, sh_path, workload, minor_faults, major_faults, swaps, in_block, out_block, signals, voluntary_ctx, involuntary_ctx)) %>%
567 left_join(label_lookup, by = c('sh_path')) %>%
568 select(-c(sh_path)) %>%
569 # we want to compare workloads on adjacent rows
570 arrange(workload) ->
571 details_io
572
573 Log('details')
574 print(details)
575
576 cols2 = c('workload', 'host_name',
577 'bash', 'dash', 'osh-native', 'osh-souffle', 'osh-static',
578 'osh_bash_ratio', 'static_bash_ratio')
579
580 # Elapsed time comparison
581 details %>%
582 select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
583 spread(key = shell_label, value = elapsed_ms) %>%
584 mutate(osh_bash_ratio = `osh-native` / bash) %>%
585 mutate(static_bash_ratio = `osh-static` / bash) %>%
586 arrange(workload, host_name) %>%
587 select(all_of(cols2)) ->
588 elapsed
589
590 Log('elapsed')
591 print(elapsed)
592
593 # Minor Page Faults Comparison
594 details_io %>%
595 select(c(host_name, shell_label, workload, minor_faults)) %>%
596 spread(key = shell_label, value = minor_faults) %>%
597 mutate(osh_bash_ratio = `osh-native` / bash) %>%
598 mutate(static_bash_ratio = `osh-static` / bash) %>%
599 arrange(workload, host_name) %>%
600 select(all_of(cols2)) ->
601 page_faults
602
603 Log('page_faults')
604 print(page_faults)
605
606 # Max RSS comparison
607 details %>%
608 select(c(host_name, shell_label, workload, max_rss_MB)) %>%
609 spread(key = shell_label, value = max_rss_MB) %>%
610 mutate(osh_bash_ratio = `osh-native` / bash) %>%
611 mutate(static_bash_ratio = `osh-static` / bash) %>%
612 arrange(workload, host_name) %>%
613 select(all_of(cols2)) ->
614 max_rss
615
616 Log('max rss')
617 print(max_rss)
618
619 details %>%
620 select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
621 mutate(join_id = sprintf("gc-%d", task_id)) %>%
622 select(-c(task_id)) ->
623 gc_details
624
625 Log('GC details')
626 print(gc_details)
627 Log('')
628
629 Log('GC stats')
630 print(gc_stats)
631 Log('')
632
633 gc_stats %>%
634 left_join(gc_details, by = c('join_id', 'host_name')) %>%
635 select(-c(join_id, roots_capacity, objs_capacity)) %>%
636 # Do same transformations as GcReport()
637 mutate(allocated_MB = bytes_allocated / 1e6) %>%
638 select(-c(bytes_allocated)) %>%
639 rename(num_gc_done = num_collections) %>%
640 # Put these columns first
641 relocate(workload, host_name,
642 elapsed_ms, max_gc_millis, total_gc_millis,
643 allocated_MB, max_rss_MB, num_allocated) ->
644 gc_stats
645
646 Log('After GC stats')
647 print(gc_stats)
648 Log('')
649
650 WriteSimpleProvenance(provenance, out_dir)
651
652 # milliseconds don't need decimal digit
653 precision = ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
654 `osh-native` = 0, `osh-souffle` = 0, `osh-static` = 0,
655 osh_bash_ratio = 2,
656 static_bash_ratio = 2))
657 writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
658 writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)
659
660 precision2 = ColumnPrecision(list(osh_bash_ratio = 2, static_bash_ratio = 2))
661 writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)
662
663 precision3 = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
664 default = 0)
665 writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)
666
667 writeTsv(details, file.path(out_dir, 'details'), precision3)
668 writeTsv(details_io, file.path(out_dir, 'details_io'))
669
670 Log('Wrote %s', out_dir)
671}
672
673VmBaselineReport = function(in_dir, out_dir) {
674 vm = readTsv(file.path(in_dir, 'vm-baseline.tsv'))
675 #print(vm)
676
677 # Not using DistinctHosts() because field host_hash isn't collected
678 num_hosts = nrow(vm %>% distinct(host))
679
680 vm %>%
681 rename(kib = metric_value) %>%
682 mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
683 megabytes = kib * 1024 / 1e6) %>%
684 select(-c(shell_name, kib)) %>%
685 spread(key = c(metric_name), value = megabytes) %>%
686 rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
687 select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
688 arrange(shell_label, shell_hash, host, VmPeak_MB) ->
689 vm
690
691 print(vm)
692
693 writeTsv(vm, file.path(out_dir, 'vm-baseline'))
694}
695
696WriteOvmBuildDetails = function(distinct_hosts, distinct_compilers, out_dir) {
697 host_table = tibble(
698 host_label = distinct_hosts$host_label,
699 host_id = paste(distinct_hosts$host_name,
700 distinct_hosts$host_hash, sep='-'),
701 host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
702 )
703 print(host_table)
704
705 dc = distinct_compilers
706 compiler_table = tibble(
707 compiler_label = dc$compiler_label,
708 compiler_id = paste(dc$compiler_label, dc$compiler_hash, sep='-'),
709 compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
710 )
711 print(compiler_table)
712
713 writeTsv(host_table, file.path(out_dir, 'hosts'))
714 writeTsv(compiler_table, file.path(out_dir, 'compilers'))
715}
716
717OvmBuildReport = function(in_dir, out_dir) {
718 times = readTsv(file.path(in_dir, 'times.tsv'))
719 native_sizes = readTsv(file.path(in_dir, 'native-sizes.tsv'))
720 #raw_data = readTsv(file.path(in_dir, 'raw-data.tsv'))
721
722 times %>% filter(status != 0) -> failed
723 if (nrow(failed) != 0) {
724 print(failed)
725 stop('Some ovm-build tasks failed')
726 }
727
728 times %>% distinct(host_name, host_hash) -> distinct_hosts
729 distinct_hosts$host_label = distinct_hosts$host_name
730
731 times %>% distinct(compiler_path, compiler_hash) -> distinct_compilers
732 distinct_compilers$compiler_label = basename(distinct_compilers$compiler_path)
733
734 #print(distinct_hosts)
735 #print(distinct_compilers)
736
737 WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)
738
739 times %>%
740 select(-c(status)) %>%
741 left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
742 left_join(distinct_compilers, by = c('compiler_path', 'compiler_hash')) %>%
743 select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
744 mutate(src_dir = basename(src_dir),
745 host_label = paste("host ", host_label),
746 is_conf = str_detect(action, 'configure'),
747 is_ovm = str_detect(action, 'oil.ovm'),
748 is_dbg = str_detect(action, 'dbg'),
749 ) %>%
750 select(host_label, src_dir, compiler_label, action, is_conf, is_ovm, is_dbg,
751 elapsed_secs) %>%
752 spread(key = c(host_label), value = elapsed_secs) %>%
753 arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
754 select(-c(is_conf, is_ovm, is_dbg)) ->
755 times
756
757 #print(times)
758
759 # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
760 native_sizes %>%
761 select(c(host_label, path, num_bytes)) %>%
762 mutate(host_label = paste("host ", host_label),
763 binary = basename(path),
764 compiler = basename(dirname(path)),
765 ) %>%
766 select(-c(path)) %>%
767 spread(key = c(host_label), value = num_bytes) %>%
768 arrange(compiler, binary) ->
769 native_sizes
770
771 # NOTE: These don't have the host and compiler.
772 writeTsv(times, file.path(out_dir, 'times'))
773 writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))
774
775 # TODO: I want a size report too
776 #writeCsv(sizes, file.path(out_dir, 'sizes'))
777}
778
779unique_stdout_md5sum = function(t, num_expected) {
780 u = n_distinct(t$stdout_md5sum)
781 if (u != num_expected) {
782 t %>% select(c(host_name, task_name, arg1, arg2, runtime_name, stdout_md5sum)) %>% print()
783 stop(sprintf('Expected %d unique md5sums, got %d', num_expected, u))
784 }
785}
786
787ComputeReport = function(in_dir, out_dir) {
788 # TSV file, not CSV
789 times = read.table(file.path(in_dir, 'times.tsv'), header=T)
790 print(times)
791
792 times %>% filter(status != 0) -> failed
793 if (nrow(failed) != 0) {
794 print(failed)
795 stop('Some compute tasks failed')
796 }
797
798 #
799 # Check correctness
800 #
801
802 times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
803 times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
804 times %>% filter(task_name == 'for_loop') %>% unique_stdout_md5sum(1)
805 times %>% filter(task_name == 'control_flow') %>% unique_stdout_md5sum(1)
806 times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
807 # 3 different inputs
808 times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)
809
810 times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)
811
812 # TODO:
813 # - oils_cpp doesn't implement unicode LANG=C
814 # - bash behaves differently on your desktop vs. in the container
815 # - might need layer-locales in the image?
816
817 #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
818 # Ditto here
819 #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)
820
821 #
822 # Find distinct shells and hosts, and label them for readability.
823 #
824
825 # Runtimes are called shells, as a hack for code reuse
826 times %>%
827 mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
828 select(c(host_name, host_hash, shell_name, shell_hash)) ->
829 tmp
830
831 distinct_hosts = DistinctHosts(tmp)
832 Log('')
833 Log('Distinct hosts')
834 print(distinct_hosts)
835
836 distinct_shells = DistinctShells(tmp)
837 Log('')
838 Log('Distinct runtimes')
839 print(distinct_shells)
840
841 num_hosts = nrow(distinct_hosts)
842
843 times %>%
844 select(-c(status, stdout_md5sum, stdout_filename, host_hash, runtime_hash)) %>%
845 mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
846 elapsed_ms = elapsed_secs * 1000,
847 user_ms = user_secs * 1000,
848 sys_ms = sys_secs * 1000,
849 max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
850 select(-c(runtime_name, elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
851 arrange(host_name, task_name, arg1, arg2, user_ms) ->
852 details
853
854 times %>%
855 mutate(
856 runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
857 stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
858 select(c(host_name, task_name, arg1, arg2, runtime_label,
859 stdout_md5sum, stdout_md5sum_HREF)) ->
860 stdout_files
861
862 details %>% filter(task_name == 'hello') %>% select(-c(task_name)) -> hello
863 details %>% filter(task_name == 'fib') %>% select(-c(task_name)) -> fib
864 details %>% filter(task_name == 'for_loop') %>% select(-c(task_name)) -> for_loop
865 details %>% filter(task_name == 'control_flow') %>% select(-c(task_name)) -> control_flow
866 details %>% filter(task_name == 'word_freq') %>% select(-c(task_name)) -> word_freq
867 # There's no arg2
868 details %>% filter(task_name == 'parse_help') %>% select(-c(task_name, arg2)) -> parse_help
869
870 details %>% filter(task_name == 'bubble_sort') %>% select(-c(task_name)) -> bubble_sort
871 details %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% select(-c(task_name)) -> palindrome
872
873 precision = ColumnPrecision(list(max_rss_MB = 1), default = 0)
874 writeTsv(details, file.path(out_dir, 'details'), precision)
875
876 writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)
877
878 writeTsv(hello, file.path(out_dir, 'hello'), precision)
879 writeTsv(fib, file.path(out_dir, 'fib'), precision)
880 writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
881 writeTsv(for_loop, file.path(out_dir, 'for_loop'), precision)
882 writeTsv(control_flow, file.path(out_dir, 'control_flow'), precision)
883 writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)
884
885 writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
886 writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)
887
888 WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
889}
890
891WriteOneTask = function(times, out_dir, task_name, precision) {
892 times %>%
893 filter(task == task_name) %>%
894 select(-c(task)) -> subset
895
896 writeTsv(subset, file.path(out_dir, task_name), precision)
897}
898
899SHELL_ORDER = c('dash',
900 'bash',
901 'zsh',
902 '_bin/cxx-opt+bumpleak/osh',
903 '_bin/cxx-opt+bumproot/osh',
904 '_bin/cxx-opt+bumpsmall/osh',
905 '_bin/cxx-opt/osh',
906 '_bin/cxx-opt/mycpp-souffle/osh',
907 '_bin/cxx-opt+nopool/osh')
908
909GcReport = function(in_dir, out_dir) {
910 times = read.table(file.path(in_dir, 'raw/times.tsv'), header=T)
911 gc_stats = read.table(file.path(in_dir, 'stage1/gc_stats.tsv'), header=T)
912
913 times %>% filter(status != 0) -> failed
914 if (nrow(failed) != 0) {
915 print(failed)
916 stop('Some gc tasks failed')
917 }
918
919 # Change units and order columns
920 times %>%
921 arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
922 mutate(elapsed_ms = elapsed_secs * 1000,
923 user_ms = user_secs * 1000,
924 sys_ms = sys_secs * 1000,
925 max_rss_MB = max_rss_KiB * 1024 / 1e6,
926 shell_label = ShellLabelFromPath(sh_path)
927 ) %>%
928 select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB, shell_label,
929 shell_runtime_opts)) ->
930 times
931
932 # Join and order columns
933 gc_stats %>% left_join(times, by = c('join_id')) %>%
934 arrange(desc(task)) %>%
935 mutate(allocated_MB = bytes_allocated / 1e6) %>%
936 # try to make the table skinnier
937 rename(num_gc_done = num_collections) %>%
938 select(task, elapsed_ms, max_gc_millis, total_gc_millis,
939 allocated_MB, max_rss_MB, num_allocated,
940 num_gc_points, num_gc_done, gc_threshold, num_growths, max_survived,
941 shell_label) ->
942 gc_stats
943
944 times %>% select(-c(join_id)) -> times
945
946
947 precision = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
948 default = 0)
949
950 writeTsv(times, file.path(out_dir, 'times'), precision)
951 writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)
952
953 tasks = c('parse.configure-coreutils',
954 'parse.configure-cpython',
955 'parse.abuild',
956 'ex.compute-fib',
957 'ex.bashcomp-parse-help',
958 'ex.abuild-print-help')
959 # Write out separate rows
960 for (task in tasks) {
961 WriteOneTask(times, out_dir, task, precision)
962 }
963}
964
965GcCachegrindReport = function(in_dir, out_dir) {
966 times = readTsv(file.path(in_dir, 'raw/times.tsv'))
967 counts = readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))
968
969 times %>% filter(status != 0) -> failed
970 if (nrow(failed) != 0) {
971 print(failed)
972 stop('Some gc tasks failed')
973 }
974
975 print(times)
976 print(counts)
977
978 counts %>% left_join(times, by = c('join_id')) %>%
979 mutate(million_irefs = irefs / 1e6) %>%
980 select(c(million_irefs, task, sh_path, shell_runtime_opts)) %>%
981 arrange(factor(sh_path, levels = SHELL_ORDER)) ->
982 counts
983
984 precision = NULL
985 tasks = c('parse.abuild', 'ex.compute-fib')
986 for (task in tasks) {
987 WriteOneTask(counts, out_dir, task, precision)
988 }
989}
990
991MyCppReport = function(in_dir, out_dir) {
992 times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
993 print(times)
994
995 times %>% filter(status != 0) -> failed
996 if (nrow(failed) != 0) {
997 print(failed)
998 stop('Some mycpp tasks failed')
999 }
1000
1001 # Don't care about elapsed and system
1002 times %>% select(-c(status, elapsed_secs, bin, task_out)) %>%
1003 mutate(example_name_HREF = mycppUrl(example_name),
1004 gen = c('gen'),
1005 gen_HREF = genUrl(example_name),
1006 user_ms = user_secs * 1000,
1007 sys_ms = sys_secs * 1000,
1008 max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
1009 select(-c(user_secs, sys_secs, max_rss_KiB)) ->
1010 details
1011
1012 details %>% select(-c(sys_ms, max_rss_MB)) %>%
1013 spread(key = impl, value = user_ms) %>%
1014 mutate(`C++ : Python` = `C++` / Python) %>%
1015 arrange(`C++ : Python`) ->
1016 user_time
1017
1018 details %>% select(-c(user_ms, max_rss_MB)) %>%
1019 spread(key = impl, value = sys_ms) %>%
1020 mutate(`C++ : Python` = `C++` / Python) %>%
1021 arrange(`C++ : Python`) ->
1022 sys_time
1023
1024 details %>% select(-c(user_ms, sys_ms)) %>%
1025 spread(key = impl, value = max_rss_MB) %>%
1026 mutate(`C++ : Python` = `C++` / Python) %>%
1027 arrange(`C++ : Python`) ->
1028 max_rss
1029
1030 # Sometimes it speeds up by more than 10x
1031 precision1 = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
1032 writeTsv(user_time, file.path(out_dir, 'user_time'), precision1)
1033 writeTsv(sys_time, file.path(out_dir, 'sys_time'), precision1)
1034
1035 precision2 = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
1036 writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)
1037
1038 writeTsv(details, file.path(out_dir, 'details'))
1039}
1040
1041UftraceTaskReport = function(env, task_name, summaries) {
1042 # Need this again after redirect
1043 MaybeDisableColor(stdout())
1044
1045 task_env = env[[task_name]]
1046
1047 untyped = task_env$untyped
1048 typed = task_env$typed
1049 strings = task_env$strings
1050 slabs = task_env$slabs
1051 reserve = task_env$reserve
1052
1053 string_overhead = 17 # GC header (8) + len (4) + hash value (4) + NUL (1)
1054 strings %>% mutate(obj_len = str_len + string_overhead) -> strings
1055
1056 # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
1057 # big/small
1058 #
1059 # And then zoom in on distributions as well
1060
1061 num_allocs = nrow(untyped)
1062 total_bytes = sum(untyped$obj_len)
1063
1064 untyped %>% group_by(obj_len) %>% count() %>% ungroup() -> untyped_hist
1065 #print(untyped_hist)
1066
1067 untyped_hist %>%
1068 mutate(n_less_than = cumsum(n),
1069 percent = n_less_than * 100.0 / num_allocs) ->
1070 alloc_sizes
1071
1072 a24 = untyped_hist %>% filter(obj_len <= 24)
1073 a48 = untyped_hist %>% filter(obj_len <= 48)
1074 a96 = untyped_hist %>% filter(obj_len <= 96)
1075
1076 allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
1077 allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
1078 allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs
1079
1080 Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)
1081
1082 options(tibble.print_min=25)
1083
1084 Log('')
1085 Log('All allocations')
1086 print(alloc_sizes %>% head(22))
1087 print(alloc_sizes %>% tail(5))
1088
1089 Log('')
1090 Log('Common Sizes')
1091 print(untyped_hist %>% arrange(desc(n)) %>% head(8))
1092
1093 Log('')
1094 Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
1095 Log('')
1096
1097 Log('Typed allocations')
1098
1099 num_typed = nrow(typed)
1100
1101 typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
1102 mutate(percent = n * 100.0 / num_typed) %>%
1103 arrange(desc(n)) -> most_common_types
1104
1105 print(most_common_types %>% head(20))
1106 print(most_common_types %>% tail(5))
1107
1108 lists = typed %>% filter(str_starts(func_name, ('List<')))
1109 #print(lists)
1110
1111 num_lists = nrow(lists)
1112 total_list_bytes = num_lists * 24 # sizeof List<T> head is hard-coded
1113
1114 Log('')
1115 Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
1116 Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
1117 Log('')
1118
1119 #
1120 # Strings
1121 #
1122
1123 num_strings = nrow(strings)
1124 total_string_bytes = sum(strings$obj_len)
1125
1126 strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
1127 mutate(n_less_than = cumsum(n),
1128 percent = n_less_than * 100.0 / num_strings) ->
1129 string_lengths
1130
1131 strs_6_bytes_or_less = string_lengths %>% filter(str_len == 6) %>% select(percent)
1132 strs_14_bytes_or_less = string_lengths %>% filter(str_len == 14) %>% select(percent)
1133
1134 # Parse workload
1135 # 62% of strings <= 6 bytes
1136 # 84% of strings <= 14 bytes
1137
1138 Log('Str - NewStr() and OverAllocatedStr()')
1139 print(string_lengths %>% head(16))
1140 print(string_lengths %>% tail(5))
1141 Log('')
1142
1143 Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
1144 commas(sum(strings$str_len)), commas(total_string_bytes))
1145 Log('')
1146 Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
1147 Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
1148 Log('')
1149
1150 #
1151 # Slabs
1152 #
1153
1154 Log('NewSlab()')
1155
1156 num_slabs = nrow(slabs)
1157 slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
1158 mutate(n_less_than = cumsum(n),
1159 percent = n_less_than * 100.0 / num_slabs) ->
1160 slab_lengths
1161
1162 slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
1163 arrange(desc(n)) -> slab_types
1164
1165 Log(' Lengths')
1166 print(slab_lengths %>% head())
1167 print(slab_lengths %>% tail(5))
1168 Log('')
1169
1170 Log(' Slab Types')
1171 print(slab_types %>% head())
1172 print(slab_types %>% tail(5))
1173 Log('')
1174
1175 total_slab_items = sum(slabs$slab_len)
1176
1177 Log('%s slabs, total items = %s', commas(num_slabs),
1178 commas(sum(slabs$slab_len)))
1179 Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
1180 Log('')
1181
1182 #
1183 # reserve() calls
1184 #
1185
1186 # There should be strictly more List::reserve() calls than NewSlab
1187
1188 Log('::reserve(int n)')
1189 Log('')
1190
1191 num_reserve = nrow(reserve)
1192 reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
1193 mutate(n_less_than = cumsum(n),
1194 percent = n_less_than * 100.0 / num_reserve) ->
1195 reserve_args
1196
1197 Log(' Num Items')
1198 print(reserve_args %>% head(15))
1199 print(reserve_args %>% tail(5))
1200 Log('')
1201
1202 Log('%s reserve() calls, total items = %s', commas(num_reserve),
1203 commas(sum(reserve$num_items)))
1204 Log('')
1205
1206 # Accounting for all allocations!
1207 Log('Untyped: %s', commas(num_allocs))
1208 Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
1209 Log('')
1210
1211 num_other_typed = num_typed - num_lists
1212
1213 # Summary table
1214 stats = tibble(task = task_name,
1215 total_bytes_ = commas(total_bytes),
1216 num_allocs_ = commas(num_allocs),
1217 sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
1218 num_reserve_calls = commas(num_reserve),
1219
1220 percent_list_allocs = Percent(num_lists, num_allocs),
1221 percent_slab_allocs = Percent(num_slabs, num_allocs),
1222 percent_string_allocs = Percent(num_strings, num_allocs),
1223 percent_other_typed_allocs = Percent(num_other_typed, num_allocs),
1224
1225 percent_list_bytes = Percent(total_list_bytes, total_bytes),
1226 percent_string_bytes = Percent(total_string_bytes, total_bytes),
1227
1228 allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
1229 allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
1230 allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),
1231
1232 strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
1233 strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less),
1234 )
1235 summaries$stats[[task_name]] = stats
1236
1237 summaries$most_common_types[[task_name]] = most_common_types
1238}
1239
1240LoadUftraceTsv = function(in_dir, env) {
1241 for (task in list.files(in_dir)) {
1242 Log('Loading data for task %s', task)
1243 base_dir = file.path(in_dir, task)
1244
1245 task_env = new.env()
1246 env[[task]] = task_env
1247
1248 # TSV file, not CSV
1249 task_env$untyped = readTsv(file.path(base_dir, 'all-untyped.tsv'))
1250 task_env$typed = readTsv(file.path(base_dir, 'typed.tsv'))
1251 task_env$strings = readTsv(file.path(base_dir, 'strings.tsv'))
1252 task_env$slabs = readTsv(file.path(base_dir, 'slabs.tsv'))
1253 task_env$reserve = readTsv(file.path(base_dir, 'reserve.tsv'))
1254
1255 # median string length is 4, mean is 9.5!
1256 Log('UNTYPED')
1257 print(summary(task_env$untyped))
1258 Log('')
1259
1260 Log('TYPED')
1261 print(summary(task_env$typed))
1262 Log('')
1263
1264 Log('STRINGS')
1265 print(summary(task_env$strings))
1266 Log('')
1267
1268 Log('SLABS')
1269 print(summary(task_env$slabs))
1270 Log('')
1271
1272 Log('RESERVE')
1273 print(summary(task_env$reserve))
1274 Log('')
1275 }
1276}
1277
1278Percent = function(n, total) {
1279 sprintf('%.1f%%', n * 100.0 / total)
1280}
1281
1282PrettyPrintLong = function(d) {
1283 tr = t(d) # transpose
1284
1285 row_names = rownames(tr)
1286
1287 for (i in 1:nrow(tr)) {
1288 row_name = row_names[i]
1289 cat(sprintf('%26s', row_name)) # calculated min width manually
1290 cat(sprintf('%20s', tr[i,]))
1291 cat('\n')
1292
1293 # Extra spacing
1294 if (row_name %in% c('num_reserve_calls',
1295 'percent_string_bytes',
1296 'percent_other_typed_allocs',
1297 'allocs_96_bytes_or_less')) {
1298 cat('\n')
1299 }
1300 }
1301}
1302
1303
1304UftraceReport = function(env, out_dir) {
1305 # summaries$stats should be a list of 1-row data frames
1306 # summaries$top_types should be a list of types
1307 summaries = new.env()
1308
1309 for (task_name in names(env)) {
1310 report_out = file.path(out_dir, paste0(task_name, '.txt'))
1311
1312 Log('Making report for task %s -> %s', task_name, report_out)
1313
1314 sink(file = report_out)
1315 UftraceTaskReport(env, task_name, summaries)
1316 sink() # reset
1317 }
1318 Log('')
1319
1320 # Concate all the data frames added to summary
1321 stats = bind_rows(as.list(summaries$stats))
1322
1323 sink(file = file.path(out_dir, 'summary.txt'))
1324 #print(stats)
1325 #Log('')
1326
1327 PrettyPrintLong(stats)
1328 Log('')
1329
1330 mct = summaries$most_common_types
1331 for (task_name in names(mct)) {
1332 Log('Common types in workload %s', task_name)
1333 Log('')
1334
1335 print(mct[[task_name]] %>% head(5))
1336 Log('')
1337 }
1338 sink()
1339
1340 # For the REPL
1341 return(list(stats = stats))
1342}
1343
1344main = function(argv) {
1345 action = argv[[1]]
1346 in_dir = argv[[2]]
1347 out_dir = argv[[3]]
1348
1349 if (action == 'osh-parser') {
1350 ParserReport(in_dir, out_dir)
1351
1352 } else if (action == 'osh-runtime') {
1353 RuntimeReport(in_dir, out_dir)
1354
1355 } else if (action == 'vm-baseline') {
1356 VmBaselineReport(in_dir, out_dir)
1357
1358 } else if (action == 'ovm-build') {
1359 OvmBuildReport(in_dir, out_dir)
1360
1361 } else if (action == 'compute') {
1362 ComputeReport(in_dir, out_dir)
1363
1364 } else if (action == 'gc') {
1365 GcReport(in_dir, out_dir)
1366
1367 } else if (action == 'gc-cachegrind') {
1368 GcCachegrindReport(in_dir, out_dir)
1369
1370 } else if (action == 'mycpp') {
1371 MyCppReport(in_dir, out_dir)
1372
1373 } else if (action == 'uftrace') {
1374 d = new.env()
1375 LoadUftraceTsv(in_dir, d)
1376 UftraceReport(d, out_dir)
1377
1378 } else {
1379 Log("Invalid action '%s'", action)
1380 quit(status = 1)
1381 }
1382 Log('PID %d done', Sys.getpid())
1383}
1384
1385if (length(sys.frames()) == 0) {
1386 # increase ggplot font size globally
1387 #theme_set(theme_grey(base_size = 20))
1388
1389 main(commandArgs(TRUE))
1390}