1 | #!/usr/bin/env Rscript
|
2 | #
|
3 | # benchmarks/report.R -- Analyze data collected by shell scripts.
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/report.R OUT_DIR [TIMES_CSV...]
|
7 |
|
8 | # Suppress warnings about functions masked from 'package:stats' and 'package:base'
|
9 | # filter, lag
|
10 | # intersect, setdiff, setequal, union
|
11 | library(dplyr, warn.conflicts = FALSE)
|
12 | library(tidyr) # spread()
|
13 | library(stringr)
|
14 |
|
15 | source('benchmarks/common.R')
|
16 |
|
17 | options(stringsAsFactors = F)
|
18 |
|
19 | # For pretty printing
|
20 | commas = function(x) {
|
21 | format(x, big.mark=',')
|
22 | }
|
23 |
|
24 | sourceUrl = function(path) {
|
25 | sprintf('https://github.com/oilshell/oil/blob/master/%s', path)
|
26 | }
|
27 |
|
28 | # Takes a filename, not a path.
|
29 | sourceUrl2 = function(filename) {
|
30 | sprintf(
|
31 | 'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s',
|
32 | filename)
|
33 | }
|
34 |
|
35 | mycppUrl = function(name) {
|
36 | sprintf('https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py', name)
|
37 | }
|
38 |
|
39 | genUrl = function(name) {
|
40 | sprintf('../../_gen/mycpp/examples/%s.mycpp.cc', name)
|
41 | }
|
42 |
|
43 |
|
44 | # TODO: Set up cgit because Github links are slow.
|
45 | benchmarkDataLink = function(subdir, name, suffix) {
|
46 | #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
|
47 | sprintf('https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s',
|
48 | subdir, name, suffix)
|
49 | }
|
50 |
|
51 | provenanceLink = function(subdir, name, suffix) {
|
52 | sprintf('../%s/%s%s', subdir, name, suffix)
|
53 | }
|
54 |
|
55 |
|
56 | GetOshLabel = function(shell_hash, prov_dir) {
|
57 | ### Given a string, return another string.
|
58 |
|
59 | path = sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)
|
60 |
|
61 | if (file.exists(path)) {
|
62 | Log('Reading %s', path)
|
63 | lines = readLines(path)
|
64 | if (length(grep('_bin/osh', lines)) > 0) {
|
65 | label = 'osh-ovm'
|
66 | } else if (length(grep('bin/osh', lines)) > 0) {
|
67 | label = 'osh-cpython'
|
68 | } else if (length(grep('_bin/.*/mycpp-souffle/osh', lines)) > 0) {
|
69 | label = 'osh-souffle'
|
70 | } else if (length(grep('_bin/.*/osh', lines)) > 0) {
|
71 | label = 'osh-native'
|
72 | } else {
|
73 | stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
|
74 | }
|
75 | } else {
|
76 | stop(sprintf("%s doesn't exist", path))
|
77 | }
|
78 | return(label)
|
79 | }
|
80 |
|
81 | osh_opt_suffix1 = '_bin/cxx-opt/osh'
|
82 | osh_opt_suffix2 = '_bin/cxx-opt-sh/osh'
|
83 |
|
84 | osh_souffle_suffix1 = '_bin/cxx-opt/mycpp-souffle/osh'
|
85 | osh_souffle_suffix2 = '_bin/cxx-opt-sh/mycpp-souffle/osh'
|
86 |
|
87 | ysh_souffle_suffix1 = '_bin/cxx-opt/mycpp-souffle/ysh'
|
88 | ysh_souffle_suffix2 = '_bin/cxx-opt-sh/mycpp-souffle/ysh'
|
89 |
|
90 | ysh_opt_suffix1 = '_bin/cxx-opt/ysh'
|
91 | ysh_opt_suffix2 = '_bin/cxx-opt-sh/ysh'
|
92 |
|
93 | ShellLabels = function(shell_name, shell_hash, num_hosts) {
|
94 | ### Given 2 vectors, return a vector of readable labels.
|
95 |
|
96 | # TODO: Clean up callers. Some metrics all this function with a
|
97 | # shell/runtime BASENAME, and others a PATH
|
98 | # - e.g. ComputeReport calls this with runtime_name which is actually a PATH
|
99 |
|
100 | #Log('name %s', shell_name)
|
101 | #Log('hash %s', shell_hash)
|
102 |
|
103 | if (num_hosts == 1) {
|
104 | prov_dir = '_tmp'
|
105 | } else {
|
106 | prov_dir = '../benchmark-data/'
|
107 | }
|
108 |
|
109 | labels = c()
|
110 | for (i in 1:length(shell_name)) {
|
111 | sh = shell_name[i]
|
112 | if (sh == 'osh') {
|
113 | label = GetOshLabel(shell_hash[i], prov_dir)
|
114 |
|
115 | } else if (endsWith(sh, 'osh-static')) {
|
116 | label = 'osh-static'
|
117 | } else if (endsWith(sh, 'ysh-static')) {
|
118 | label = 'ysh-static'
|
119 |
|
120 | } else if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
|
121 | label = 'opt/osh'
|
122 | } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
|
123 | label = 'opt/ysh'
|
124 |
|
125 | } else if (endsWith(sh, osh_souffle_suffix1) || endsWith(sh, osh_souffle_suffix2)) {
|
126 | label = 'opt/osh-souffle'
|
127 | } else if (endsWith(sh, ysh_souffle_suffix1) || endsWith(sh, ysh_souffle_suffix2)) {
|
128 | label = 'opt/ysh-souffle'
|
129 |
|
130 | } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
|
131 | label = 'bumpleak/osh'
|
132 |
|
133 | } else {
|
134 | label = sh
|
135 | }
|
136 |
|
137 | Log('[%s] [%s]', shell_name[i], label)
|
138 | labels = c(labels, label)
|
139 | }
|
140 |
|
141 | return(labels)
|
142 | }
|
143 |
|
144 | # Simple version of the above, used by benchmarks/{gc,osh-runtime}
|
145 | ShellLabelFromPath = function(sh_path) {
|
146 | labels = c()
|
147 | for (i in 1:length(sh_path)) {
|
148 | sh = sh_path[i]
|
149 |
|
150 | if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
|
151 | # the opt binary is called osh-native - the osh-runtime report relies on this
|
152 | label = 'osh-native'
|
153 |
|
154 | } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
|
155 | label = 'opt/ysh'
|
156 |
|
157 | } else if (endsWith(sh, 'osh-static')) {
|
158 | label = 'osh-static'
|
159 | } else if (endsWith(sh, 'ysh-static')) {
|
160 | label = 'ysh-static'
|
161 |
|
162 | } else if (endsWith(sh, osh_souffle_suffix1) || endsWith(sh, osh_souffle_suffix2)) {
|
163 | label = 'osh-souffle'
|
164 |
|
165 | } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
|
166 | label = 'bumpleak/osh'
|
167 |
|
168 | } else if (endsWith(sh, '_bin/osh')) { # the app bundle
|
169 | label = 'osh-ovm'
|
170 |
|
171 | } else if (endsWith(sh, 'bin/osh')) {
|
172 | label = 'osh-cpython'
|
173 |
|
174 | } else {
|
175 | label = sh
|
176 | }
|
177 | labels = c(labels, label)
|
178 | }
|
179 | return(labels)
|
180 | }
|
181 |
|
182 | DistinctHosts = function(t) {
|
183 | t %>% distinct(host_name, host_hash) -> distinct_hosts
|
184 | # The label is just the name
|
185 | distinct_hosts$host_label = distinct_hosts$host_name
|
186 | return(distinct_hosts)
|
187 | }
|
188 |
|
189 | DistinctShells = function(t, num_hosts = -1) {
|
190 | t %>% distinct(shell_name, shell_hash) -> distinct_shells
|
191 |
|
192 | Log('')
|
193 | Log('Labeling shells')
|
194 |
|
195 | # Calculate it if not passed
|
196 | if (num_hosts == -1) {
|
197 | num_hosts = nrow(DistinctHosts(t))
|
198 | }
|
199 |
|
200 | distinct_shells$shell_label = ShellLabels(distinct_shells$shell_name,
|
201 | distinct_shells$shell_hash,
|
202 | num_hosts)
|
203 | return(distinct_shells)
|
204 | }
|
205 |
|
206 | ParserReport = function(in_dir, out_dir) {
|
207 | times = read.csv(file.path(in_dir, 'times.csv'))
|
208 | lines = read.csv(file.path(in_dir, 'lines.csv'))
|
209 | raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))
|
210 |
|
211 | cachegrind = readTsv(file.path(in_dir, 'cachegrind.tsv'))
|
212 |
|
213 | # For joining by filename
|
214 | lines_by_filename = tibble(
|
215 | num_lines = lines$num_lines,
|
216 | filename = basename(lines$path)
|
217 | )
|
218 |
|
219 | # Remove failures
|
220 | times %>% filter(status == 0) %>% select(-c(status)) -> times
|
221 | cachegrind %>% filter(status == 0) %>% select(-c(status)) -> cachegrind
|
222 |
|
223 | # Add the number of lines, joining on path, and compute lines/ms
|
224 | times %>%
|
225 | left_join(lines, by = c('path')) %>%
|
226 | mutate(filename = basename(path), filename_HREF = sourceUrl(path),
|
227 | max_rss_MB = max_rss_KiB * 1024 / 1e6,
|
228 | elapsed_ms = elapsed_secs * 1000,
|
229 | user_ms = user_secs * 1000,
|
230 | sys_ms = sys_secs * 1000,
|
231 | lines_per_ms = num_lines / elapsed_ms) %>%
|
232 | select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs)) ->
|
233 | joined_times
|
234 |
|
235 | #print(head(times))
|
236 | #print(head(lines))
|
237 | #print(head(vm))
|
238 | #print(head(joined_times))
|
239 |
|
240 | print(summary(joined_times))
|
241 |
|
242 | #
|
243 | # Find distinct shells and hosts, and label them for readability.
|
244 | #
|
245 |
|
246 | distinct_hosts = DistinctHosts(joined_times)
|
247 | Log('')
|
248 | Log('Distinct hosts')
|
249 | print(distinct_hosts)
|
250 |
|
251 | distinct_shells = DistinctShells(joined_times)
|
252 | Log('')
|
253 | Log('Distinct shells')
|
254 | print(distinct_shells)
|
255 |
|
256 | # Replace name/hash combinations with labels.
|
257 | joined_times %>%
|
258 | left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
|
259 | left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
|
260 | select(-c(host_name, host_hash, shell_name, shell_hash)) ->
|
261 | joined_times
|
262 |
|
263 | # Like 'times', but do shell_label as one step
|
264 | # Hack: we know benchmarks/auto.sh runs this on one machine
|
265 | distinct_shells_2 = DistinctShells(cachegrind, num_hosts = nrow(distinct_hosts))
|
266 | cachegrind %>%
|
267 | left_join(lines, by = c('path')) %>%
|
268 | select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
|
269 | left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
|
270 | select(-c(shell_name, shell_hash)) %>%
|
271 | mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
|
272 | select(-c(path)) ->
|
273 | joined_cachegrind
|
274 |
|
275 | Log('summary(joined_times):')
|
276 | print(summary(joined_times))
|
277 | Log('head(joined_times):')
|
278 | print(head(joined_times))
|
279 |
|
280 | # Summarize rates by platform/shell
|
281 | joined_times %>%
|
282 | mutate(host_label = paste("host", host_label)) %>%
|
283 | group_by(host_label, shell_label) %>%
|
284 | summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
|
285 | mutate(lines_per_ms = total_lines / total_ms) %>%
|
286 | select(-c(total_ms)) %>%
|
287 | spread(key = host_label, value = lines_per_ms) ->
|
288 | times_summary
|
289 |
|
290 | # Sort by parsing rate on machine 1
|
291 | if ("host hoover" %in% colnames(times_summary)) {
|
292 | times_summary %>% arrange(desc(`host hoover`)) -> times_summary
|
293 | } else {
|
294 | times_summary %>% arrange(desc(`host no-host`)) -> times_summary
|
295 | }
|
296 |
|
297 | Log('times_summary:')
|
298 | print(times_summary)
|
299 |
|
300 | # Summarize cachegrind by platform/shell
|
301 | # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
|
302 | joined_cachegrind %>%
|
303 | group_by(shell_label) %>%
|
304 | summarize(total_lines = sum(num_lines), total_irefs = sum(as.numeric(irefs))) %>%
|
305 | mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
|
306 | select(-c(total_irefs)) ->
|
307 | cachegrind_summary
|
308 |
|
309 | if ("no-host" %in% distinct_hosts$host_label) {
|
310 |
|
311 | # We don't have all the shells
|
312 | elapsed = NULL
|
313 | rate = NULL
|
314 | max_rss = NULL
|
315 | instructions = NULL
|
316 |
|
317 | joined_times %>%
|
318 | select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
|
319 | num_lines, filename, filename_HREF)) %>%
|
320 | arrange(filename, elapsed_ms) ->
|
321 | times_flat
|
322 |
|
323 | joined_cachegrind %>%
|
324 | select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
|
325 | arrange(filename, irefs) ->
|
326 | cachegrind_flat
|
327 |
|
328 | } else {
|
329 |
|
330 | times_flat = NULL
|
331 | cachegrind_flat = NULL
|
332 |
|
333 | # Hack for release. TODO: unify with SoilAdd commentMore actions
|
334 | if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
|
335 | souffle_col = c('osh-souffle')
|
336 | } else {
|
337 | souffle_col = c()
|
338 | }
|
339 |
|
340 | cols1 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
|
341 | 'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
|
342 | 'osh_to_bash_ratio', 'num_lines', 'filename', 'filename_HREF')
|
343 |
|
344 | # Elapsed seconds for each shell by platform and file
|
345 | joined_times %>%
|
346 | select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
|
347 | spread(key = shell_label, value = elapsed_ms) %>%
|
348 | arrange(host_label, num_lines) %>%
|
349 | mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
|
350 | select(all_of(cols1)) ->
|
351 | elapsed
|
352 |
|
353 | Log('\n')
|
354 | Log('ELAPSED')
|
355 | print(elapsed)
|
356 |
|
357 | cols2 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
|
358 | 'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
|
359 | 'num_lines', 'filename', 'filename_HREF')
|
360 | # Rates by file and shell
|
361 | joined_times %>%
|
362 | select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
|
363 | spread(key = shell_label, value = lines_per_ms) %>%
|
364 | arrange(host_label, num_lines) %>%
|
365 | select(all_of(cols2)) ->
|
366 | rate
|
367 |
|
368 | Log('\n')
|
369 | Log('RATE')
|
370 | print(rate)
|
371 |
|
372 | # Memory usage by file
|
373 | joined_times %>%
|
374 | select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
|
375 | spread(key = shell_label, value = max_rss_MB) %>%
|
376 | arrange(host_label, num_lines) %>%
|
377 | select(all_of(cols2)) ->
|
378 | max_rss
|
379 |
|
380 | Log('\n')
|
381 | Log('MAX RSS')
|
382 | print(max_rss)
|
383 |
|
384 | Log('\n')
|
385 | Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
|
386 | print(joined_cachegrind)
|
387 | #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))
|
388 |
|
389 | cols3 = c('bash', 'dash', 'mksh', 'osh-native', souffle_col,
|
390 | 'num_lines', 'filename', 'filename_HREF')
|
391 |
|
392 | # Cachegrind instructions by file
|
393 | joined_cachegrind %>%
|
394 | mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
|
395 | select(-c(irefs)) %>%
|
396 | spread(key = shell_label, value = thousand_irefs_per_line) %>%
|
397 | arrange(num_lines) %>%
|
398 | select(all_of(cols3)) ->
|
399 | instructions
|
400 |
|
401 | Log('\n')
|
402 | Log('instructions has %d rows', nrow(instructions))
|
403 | print(instructions)
|
404 | }
|
405 |
|
406 | WriteProvenance(distinct_hosts, distinct_shells, out_dir)
|
407 |
|
408 | raw_data_table = tibble(
|
409 | filename = basename(as.character(raw_data$path)),
|
410 | filename_HREF = benchmarkDataLink('osh-parser', filename, '')
|
411 | )
|
412 | #print(raw_data_table)
|
413 |
|
414 | writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))
|
415 |
|
416 | precision = SamePrecision(0) # lines per ms
|
417 | writeCsv(times_summary, file.path(out_dir, 'summary'), precision)
|
418 |
|
419 | precision = ColumnPrecision(list(), default = 1)
|
420 | writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'), precision)
|
421 |
|
422 | if (!is.null(times_flat)) {
|
423 | precision = SamePrecision(0)
|
424 | writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
|
425 | }
|
426 |
|
427 | if (!is.null(cachegrind_flat)) {
|
428 | precision = SamePrecision(0)
|
429 | writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
|
430 | }
|
431 |
|
432 | if (!is.null(elapsed)) { # equivalent to no-host
|
433 | # Round to nearest millisecond, but the ratio has a decimal point.
|
434 | precision = ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
|
435 | writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)
|
436 |
|
437 | precision = SamePrecision(0)
|
438 | writeCsv(rate, file.path(out_dir, 'rate'), precision)
|
439 |
|
440 | writeCsv(max_rss, file.path(out_dir, 'max_rss'))
|
441 |
|
442 | precision = SamePrecision(1)
|
443 | writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
|
444 | }
|
445 |
|
446 | Log('Wrote %s', out_dir)
|
447 | }
|
448 |
|
449 | WriteProvenance = function(distinct_hosts, distinct_shells, out_dir, tsv = F) {
|
450 |
|
451 | num_hosts = nrow(distinct_hosts)
|
452 | if (num_hosts == 1) {
|
453 | linkify = provenanceLink
|
454 | } else {
|
455 | linkify = benchmarkDataLink
|
456 | }
|
457 |
|
458 | Log('distinct_hosts')
|
459 | print(distinct_hosts)
|
460 | Log('')
|
461 |
|
462 | Log('distinct_shells')
|
463 | print(distinct_shells)
|
464 | Log('')
|
465 |
|
466 | # Should be:
|
467 | # host_id_url
|
468 | # And then csv_to_html will be smart enough? It should take --url flag?
|
469 | host_table = tibble(
|
470 | host_label = distinct_hosts$host_label,
|
471 | host_id = paste(distinct_hosts$host_name,
|
472 | distinct_hosts$host_hash, sep='-'),
|
473 | host_id_HREF = linkify('host-id', host_id, '/')
|
474 | )
|
475 | Log('host_table')
|
476 | print(host_table)
|
477 | Log('')
|
478 |
|
479 | shell_table = tibble(
|
480 | shell_label = distinct_shells$shell_label,
|
481 | shell_id = paste(distinct_shells$shell_name,
|
482 | distinct_shells$shell_hash, sep='-'),
|
483 | shell_id_HREF = linkify('shell-id', shell_id, '/')
|
484 | )
|
485 |
|
486 | Log('shell_table')
|
487 | print(shell_table)
|
488 | Log('')
|
489 |
|
490 | if (tsv) {
|
491 | writeTsv(host_table, file.path(out_dir, 'hosts'))
|
492 | writeTsv(shell_table, file.path(out_dir, 'shells'))
|
493 | } else {
|
494 | writeCsv(host_table, file.path(out_dir, 'hosts'))
|
495 | writeCsv(shell_table, file.path(out_dir, 'shells'))
|
496 | }
|
497 | }
|
498 |
|
499 | WriteSimpleProvenance = function(provenance, out_dir) {
|
500 | Log('provenance')
|
501 | print(provenance)
|
502 | Log('')
|
503 |
|
504 | # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
|
505 | # benchmarks/id.sh publish-shell-id uses
|
506 | provenance %>%
|
507 | mutate(shell_name = basename(sh_path)) %>%
|
508 | distinct(shell_label, shell_name, shell_hash) ->
|
509 | distinct_shells
|
510 |
|
511 | Log('distinct_shells')
|
512 | print(distinct_shells)
|
513 | Log('')
|
514 |
|
515 | provenance %>% distinct(host_label, host_name, host_hash) -> distinct_hosts
|
516 |
|
517 | WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
|
518 | }
|
519 |
|
520 | RuntimeReport = function(in_dir, out_dir) {
|
521 | times = readTsv(file.path(in_dir, 'times.tsv'))
|
522 |
|
523 | gc_stats = readTsv(file.path(in_dir, 'gc_stats.tsv'))
|
524 | provenance = readTsv(file.path(in_dir, 'provenance.tsv'))
|
525 |
|
526 | times %>% filter(status != 0) -> failed
|
527 | if (nrow(failed) != 0) {
|
528 | print(failed)
|
529 | stop('Some osh-runtime tasks failed')
|
530 | }
|
531 |
|
532 | # Joins:
|
533 | # times <= sh_path => provenance
|
534 | # times <= join_id, host_name => gc_stats
|
535 |
|
536 | # TODO: provenance may have rows from 2 machines. Could validate them and
|
537 | # deduplicate.
|
538 |
|
539 | # It should have (host_label, host_name, host_hash)
|
540 | # (shell_label, sh_path, shell_hash)
|
541 | provenance %>%
|
542 | mutate(host_label = host_name, shell_label = ShellLabelFromPath(sh_path)) ->
|
543 | provenance
|
544 |
|
545 | provenance %>% distinct(sh_path, shell_label) -> label_lookup
|
546 |
|
547 | Log('label_lookup')
|
548 | print(label_lookup)
|
549 |
|
550 | # Join with provenance for host label and shell label
|
551 | times %>%
|
552 | select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
|
553 | host_name, sh_path, workload)) %>%
|
554 | mutate(elapsed_ms = elapsed_secs * 1000,
|
555 | user_ms = user_secs * 1000,
|
556 | sys_ms = sys_secs * 1000,
|
557 | max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
|
558 | select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
|
559 | left_join(label_lookup, by = c('sh_path')) %>%
|
560 | select(-c(sh_path)) %>%
|
561 | # we want to compare workloads on adjacent rows
|
562 | arrange(workload) ->
|
563 | details
|
564 |
|
565 | times %>%
|
566 | select(c(task_id, host_name, sh_path, workload, minor_faults, major_faults, swaps, in_block, out_block, signals, voluntary_ctx, involuntary_ctx)) %>%
|
567 | left_join(label_lookup, by = c('sh_path')) %>%
|
568 | select(-c(sh_path)) %>%
|
569 | # we want to compare workloads on adjacent rows
|
570 | arrange(workload) ->
|
571 | details_io
|
572 |
|
573 | Log('details')
|
574 | print(details)
|
575 |
|
576 | cols2 = c('workload', 'host_name',
|
577 | 'bash', 'dash', 'osh-native', 'osh-souffle', 'osh-static',
|
578 | 'osh_bash_ratio', 'static_bash_ratio')
|
579 |
|
580 | # Elapsed time comparison
|
581 | details %>%
|
582 | select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
|
583 | spread(key = shell_label, value = elapsed_ms) %>%
|
584 | mutate(osh_bash_ratio = `osh-native` / bash) %>%
|
585 | mutate(static_bash_ratio = `osh-static` / bash) %>%
|
586 | arrange(workload, host_name) %>%
|
587 | select(all_of(cols2)) ->
|
588 | elapsed
|
589 |
|
590 | Log('elapsed')
|
591 | print(elapsed)
|
592 |
|
593 | # Minor Page Faults Comparison
|
594 | details_io %>%
|
595 | select(c(host_name, shell_label, workload, minor_faults)) %>%
|
596 | spread(key = shell_label, value = minor_faults) %>%
|
597 | mutate(osh_bash_ratio = `osh-native` / bash) %>%
|
598 | mutate(static_bash_ratio = `osh-static` / bash) %>%
|
599 | arrange(workload, host_name) %>%
|
600 | select(all_of(cols2)) ->
|
601 | page_faults
|
602 |
|
603 | Log('page_faults')
|
604 | print(page_faults)
|
605 |
|
606 | # Max RSS comparison
|
607 | details %>%
|
608 | select(c(host_name, shell_label, workload, max_rss_MB)) %>%
|
609 | spread(key = shell_label, value = max_rss_MB) %>%
|
610 | mutate(osh_bash_ratio = `osh-native` / bash) %>%
|
611 | mutate(static_bash_ratio = `osh-static` / bash) %>%
|
612 | arrange(workload, host_name) %>%
|
613 | select(all_of(cols2)) ->
|
614 | max_rss
|
615 |
|
616 | Log('max rss')
|
617 | print(max_rss)
|
618 |
|
619 | details %>%
|
620 | select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
|
621 | mutate(join_id = sprintf("gc-%d", task_id)) %>%
|
622 | select(-c(task_id)) ->
|
623 | gc_details
|
624 |
|
625 | Log('GC details')
|
626 | print(gc_details)
|
627 | Log('')
|
628 |
|
629 | Log('GC stats')
|
630 | print(gc_stats)
|
631 | Log('')
|
632 |
|
633 | gc_stats %>%
|
634 | left_join(gc_details, by = c('join_id', 'host_name')) %>%
|
635 | select(-c(join_id, roots_capacity, objs_capacity)) %>%
|
636 | # Do same transformations as GcReport()
|
637 | mutate(allocated_MB = bytes_allocated / 1e6) %>%
|
638 | select(-c(bytes_allocated)) %>%
|
639 | rename(num_gc_done = num_collections) %>%
|
640 | # Put these columns first
|
641 | relocate(workload, host_name,
|
642 | elapsed_ms, max_gc_millis, total_gc_millis,
|
643 | allocated_MB, max_rss_MB, num_allocated) ->
|
644 | gc_stats
|
645 |
|
646 | Log('After GC stats')
|
647 | print(gc_stats)
|
648 | Log('')
|
649 |
|
650 | WriteSimpleProvenance(provenance, out_dir)
|
651 |
|
652 | # milliseconds don't need decimal digit
|
653 | precision = ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
|
654 | `osh-native` = 0, `osh-souffle` = 0, `osh-static` = 0,
|
655 | osh_bash_ratio = 2,
|
656 | static_bash_ratio = 2))
|
657 | writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
|
658 | writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)
|
659 |
|
660 | precision2 = ColumnPrecision(list(osh_bash_ratio = 2, static_bash_ratio = 2))
|
661 | writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)
|
662 |
|
663 | precision3 = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
|
664 | default = 0)
|
665 | writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)
|
666 |
|
667 | writeTsv(details, file.path(out_dir, 'details'), precision3)
|
668 | writeTsv(details_io, file.path(out_dir, 'details_io'))
|
669 |
|
670 | Log('Wrote %s', out_dir)
|
671 | }
|
672 |
|
673 | VmBaselineReport = function(in_dir, out_dir) {
|
674 | vm = readTsv(file.path(in_dir, 'vm-baseline.tsv'))
|
675 | #print(vm)
|
676 |
|
677 | # Not using DistinctHosts() because field host_hash isn't collected
|
678 | num_hosts = nrow(vm %>% distinct(host))
|
679 |
|
680 | vm %>%
|
681 | rename(kib = metric_value) %>%
|
682 | mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
|
683 | megabytes = kib * 1024 / 1e6) %>%
|
684 | select(-c(shell_name, kib)) %>%
|
685 | spread(key = c(metric_name), value = megabytes) %>%
|
686 | rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
|
687 | select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
|
688 | arrange(shell_label, shell_hash, host, VmPeak_MB) ->
|
689 | vm
|
690 |
|
691 | print(vm)
|
692 |
|
693 | writeTsv(vm, file.path(out_dir, 'vm-baseline'))
|
694 | }
|
695 |
|
696 | WriteOvmBuildDetails = function(distinct_hosts, distinct_compilers, out_dir) {
|
697 | host_table = tibble(
|
698 | host_label = distinct_hosts$host_label,
|
699 | host_id = paste(distinct_hosts$host_name,
|
700 | distinct_hosts$host_hash, sep='-'),
|
701 | host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
|
702 | )
|
703 | print(host_table)
|
704 |
|
705 | dc = distinct_compilers
|
706 | compiler_table = tibble(
|
707 | compiler_label = dc$compiler_label,
|
708 | compiler_id = paste(dc$compiler_label, dc$compiler_hash, sep='-'),
|
709 | compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
|
710 | )
|
711 | print(compiler_table)
|
712 |
|
713 | writeTsv(host_table, file.path(out_dir, 'hosts'))
|
714 | writeTsv(compiler_table, file.path(out_dir, 'compilers'))
|
715 | }
|
716 |
|
717 | OvmBuildReport = function(in_dir, out_dir) {
|
718 | times = readTsv(file.path(in_dir, 'times.tsv'))
|
719 | native_sizes = readTsv(file.path(in_dir, 'native-sizes.tsv'))
|
720 | #raw_data = readTsv(file.path(in_dir, 'raw-data.tsv'))
|
721 |
|
722 | times %>% filter(status != 0) -> failed
|
723 | if (nrow(failed) != 0) {
|
724 | print(failed)
|
725 | stop('Some ovm-build tasks failed')
|
726 | }
|
727 |
|
728 | times %>% distinct(host_name, host_hash) -> distinct_hosts
|
729 | distinct_hosts$host_label = distinct_hosts$host_name
|
730 |
|
731 | times %>% distinct(compiler_path, compiler_hash) -> distinct_compilers
|
732 | distinct_compilers$compiler_label = basename(distinct_compilers$compiler_path)
|
733 |
|
734 | #print(distinct_hosts)
|
735 | #print(distinct_compilers)
|
736 |
|
737 | WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)
|
738 |
|
739 | times %>%
|
740 | select(-c(status)) %>%
|
741 | left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
|
742 | left_join(distinct_compilers, by = c('compiler_path', 'compiler_hash')) %>%
|
743 | select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
|
744 | mutate(src_dir = basename(src_dir),
|
745 | host_label = paste("host ", host_label),
|
746 | is_conf = str_detect(action, 'configure'),
|
747 | is_ovm = str_detect(action, 'oil.ovm'),
|
748 | is_dbg = str_detect(action, 'dbg'),
|
749 | ) %>%
|
750 | select(host_label, src_dir, compiler_label, action, is_conf, is_ovm, is_dbg,
|
751 | elapsed_secs) %>%
|
752 | spread(key = c(host_label), value = elapsed_secs) %>%
|
753 | arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
|
754 | select(-c(is_conf, is_ovm, is_dbg)) ->
|
755 | times
|
756 |
|
757 | #print(times)
|
758 |
|
759 | # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
|
760 | native_sizes %>%
|
761 | select(c(host_label, path, num_bytes)) %>%
|
762 | mutate(host_label = paste("host ", host_label),
|
763 | binary = basename(path),
|
764 | compiler = basename(dirname(path)),
|
765 | ) %>%
|
766 | select(-c(path)) %>%
|
767 | spread(key = c(host_label), value = num_bytes) %>%
|
768 | arrange(compiler, binary) ->
|
769 | native_sizes
|
770 |
|
771 | # NOTE: These don't have the host and compiler.
|
772 | writeTsv(times, file.path(out_dir, 'times'))
|
773 | writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))
|
774 |
|
775 | # TODO: I want a size report too
|
776 | #writeCsv(sizes, file.path(out_dir, 'sizes'))
|
777 | }
|
778 |
|
779 | unique_stdout_md5sum = function(t, num_expected) {
|
780 | u = n_distinct(t$stdout_md5sum)
|
781 | if (u != num_expected) {
|
782 | t %>% select(c(host_name, task_name, arg1, arg2, runtime_name, stdout_md5sum)) %>% print()
|
783 | stop(sprintf('Expected %d unique md5sums, got %d', num_expected, u))
|
784 | }
|
785 | }
|
786 |
|
787 | ComputeReport = function(in_dir, out_dir) {
|
788 | # TSV file, not CSV
|
789 | times = read.table(file.path(in_dir, 'times.tsv'), header=T)
|
790 | print(times)
|
791 |
|
792 | times %>% filter(status != 0) -> failed
|
793 | if (nrow(failed) != 0) {
|
794 | print(failed)
|
795 | stop('Some compute tasks failed')
|
796 | }
|
797 |
|
798 | #
|
799 | # Check correctness
|
800 | #
|
801 |
|
802 | times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
|
803 | times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
|
804 | times %>% filter(task_name == 'for_loop') %>% unique_stdout_md5sum(1)
|
805 | times %>% filter(task_name == 'control_flow') %>% unique_stdout_md5sum(1)
|
806 | times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
|
807 | # 3 different inputs
|
808 | times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)
|
809 |
|
810 | times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)
|
811 |
|
812 | # TODO:
|
813 | # - oils_cpp doesn't implement unicode LANG=C
|
814 | # - bash behaves differently on your desktop vs. in the container
|
815 | # - might need layer-locales in the image?
|
816 |
|
817 | #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
|
818 | # Ditto here
|
819 | #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)
|
820 |
|
821 | #
|
822 | # Find distinct shells and hosts, and label them for readability.
|
823 | #
|
824 |
|
825 | # Runtimes are called shells, as a hack for code reuse
|
826 | times %>%
|
827 | mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
|
828 | select(c(host_name, host_hash, shell_name, shell_hash)) ->
|
829 | tmp
|
830 |
|
831 | distinct_hosts = DistinctHosts(tmp)
|
832 | Log('')
|
833 | Log('Distinct hosts')
|
834 | print(distinct_hosts)
|
835 |
|
836 | distinct_shells = DistinctShells(tmp)
|
837 | Log('')
|
838 | Log('Distinct runtimes')
|
839 | print(distinct_shells)
|
840 |
|
841 | num_hosts = nrow(distinct_hosts)
|
842 |
|
843 | times %>%
|
844 | select(-c(status, stdout_md5sum, stdout_filename, host_hash, runtime_hash)) %>%
|
845 | mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
|
846 | elapsed_ms = elapsed_secs * 1000,
|
847 | user_ms = user_secs * 1000,
|
848 | sys_ms = sys_secs * 1000,
|
849 | max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
|
850 | select(-c(runtime_name, elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
|
851 | arrange(host_name, task_name, arg1, arg2, user_ms) ->
|
852 | details
|
853 |
|
854 | times %>%
|
855 | mutate(
|
856 | runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
|
857 | stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
|
858 | select(c(host_name, task_name, arg1, arg2, runtime_label,
|
859 | stdout_md5sum, stdout_md5sum_HREF)) ->
|
860 | stdout_files
|
861 |
|
862 | details %>% filter(task_name == 'hello') %>% select(-c(task_name)) -> hello
|
863 | details %>% filter(task_name == 'fib') %>% select(-c(task_name)) -> fib
|
864 | details %>% filter(task_name == 'for_loop') %>% select(-c(task_name)) -> for_loop
|
865 | details %>% filter(task_name == 'control_flow') %>% select(-c(task_name)) -> control_flow
|
866 | details %>% filter(task_name == 'word_freq') %>% select(-c(task_name)) -> word_freq
|
867 | # There's no arg2
|
868 | details %>% filter(task_name == 'parse_help') %>% select(-c(task_name, arg2)) -> parse_help
|
869 |
|
870 | details %>% filter(task_name == 'bubble_sort') %>% select(-c(task_name)) -> bubble_sort
|
871 | details %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% select(-c(task_name)) -> palindrome
|
872 |
|
873 | precision = ColumnPrecision(list(max_rss_MB = 1), default = 0)
|
874 | writeTsv(details, file.path(out_dir, 'details'), precision)
|
875 |
|
876 | writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)
|
877 |
|
878 | writeTsv(hello, file.path(out_dir, 'hello'), precision)
|
879 | writeTsv(fib, file.path(out_dir, 'fib'), precision)
|
880 | writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
|
881 | writeTsv(for_loop, file.path(out_dir, 'for_loop'), precision)
|
882 | writeTsv(control_flow, file.path(out_dir, 'control_flow'), precision)
|
883 | writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)
|
884 |
|
885 | writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
|
886 | writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)
|
887 |
|
888 | WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
|
889 | }
|
890 |
|
891 | WriteOneTask = function(times, out_dir, task_name, precision) {
|
892 | times %>%
|
893 | filter(task == task_name) %>%
|
894 | select(-c(task)) -> subset
|
895 |
|
896 | writeTsv(subset, file.path(out_dir, task_name), precision)
|
897 | }
|
898 |
|
899 | SHELL_ORDER = c('dash',
|
900 | 'bash',
|
901 | 'zsh',
|
902 | '_bin/cxx-opt+bumpleak/osh',
|
903 | '_bin/cxx-opt+bumproot/osh',
|
904 | '_bin/cxx-opt+bumpsmall/osh',
|
905 | '_bin/cxx-opt/osh',
|
906 | '_bin/cxx-opt/mycpp-souffle/osh',
|
907 | '_bin/cxx-opt+nopool/osh')
|
908 |
|
909 | GcReport = function(in_dir, out_dir) {
|
910 | times = read.table(file.path(in_dir, 'raw/times.tsv'), header=T)
|
911 | gc_stats = read.table(file.path(in_dir, 'stage1/gc_stats.tsv'), header=T)
|
912 |
|
913 | times %>% filter(status != 0) -> failed
|
914 | if (nrow(failed) != 0) {
|
915 | print(failed)
|
916 | stop('Some gc tasks failed')
|
917 | }
|
918 |
|
919 | # Change units and order columns
|
920 | times %>%
|
921 | arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
|
922 | mutate(elapsed_ms = elapsed_secs * 1000,
|
923 | user_ms = user_secs * 1000,
|
924 | sys_ms = sys_secs * 1000,
|
925 | max_rss_MB = max_rss_KiB * 1024 / 1e6,
|
926 | shell_label = ShellLabelFromPath(sh_path)
|
927 | ) %>%
|
928 | select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB, shell_label,
|
929 | shell_runtime_opts)) ->
|
930 | times
|
931 |
|
932 | # Join and order columns
|
933 | gc_stats %>% left_join(times, by = c('join_id')) %>%
|
934 | arrange(desc(task)) %>%
|
935 | mutate(allocated_MB = bytes_allocated / 1e6) %>%
|
936 | # try to make the table skinnier
|
937 | rename(num_gc_done = num_collections) %>%
|
938 | select(task, elapsed_ms, max_gc_millis, total_gc_millis,
|
939 | allocated_MB, max_rss_MB, num_allocated,
|
940 | num_gc_points, num_gc_done, gc_threshold, num_growths, max_survived,
|
941 | shell_label) ->
|
942 | gc_stats
|
943 |
|
944 | times %>% select(-c(join_id)) -> times
|
945 |
|
946 |
|
947 | precision = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
|
948 | default = 0)
|
949 |
|
950 | writeTsv(times, file.path(out_dir, 'times'), precision)
|
951 | writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)
|
952 |
|
953 | tasks = c('parse.configure-coreutils',
|
954 | 'parse.configure-cpython',
|
955 | 'parse.abuild',
|
956 | 'ex.compute-fib',
|
957 | 'ex.bashcomp-parse-help',
|
958 | 'ex.abuild-print-help')
|
959 | # Write out separate rows
|
960 | for (task in tasks) {
|
961 | WriteOneTask(times, out_dir, task, precision)
|
962 | }
|
963 | }
|
964 |
|
965 | GcCachegrindReport = function(in_dir, out_dir) {
|
966 | times = readTsv(file.path(in_dir, 'raw/times.tsv'))
|
967 | counts = readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))
|
968 |
|
969 | times %>% filter(status != 0) -> failed
|
970 | if (nrow(failed) != 0) {
|
971 | print(failed)
|
972 | stop('Some gc tasks failed')
|
973 | }
|
974 |
|
975 | print(times)
|
976 | print(counts)
|
977 |
|
978 | counts %>% left_join(times, by = c('join_id')) %>%
|
979 | mutate(million_irefs = irefs / 1e6) %>%
|
980 | select(c(million_irefs, task, sh_path, shell_runtime_opts)) %>%
|
981 | arrange(factor(sh_path, levels = SHELL_ORDER)) ->
|
982 | counts
|
983 |
|
984 | precision = NULL
|
985 | tasks = c('parse.abuild', 'ex.compute-fib')
|
986 | for (task in tasks) {
|
987 | WriteOneTask(counts, out_dir, task, precision)
|
988 | }
|
989 | }
|
990 |
|
991 | MyCppReport = function(in_dir, out_dir) {
|
992 | times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
|
993 | print(times)
|
994 |
|
995 | times %>% filter(status != 0) -> failed
|
996 | if (nrow(failed) != 0) {
|
997 | print(failed)
|
998 | stop('Some mycpp tasks failed')
|
999 | }
|
1000 |
|
1001 | # Don't care about elapsed and system
|
1002 | times %>% select(-c(status, elapsed_secs, bin, task_out)) %>%
|
1003 | mutate(example_name_HREF = mycppUrl(example_name),
|
1004 | gen = c('gen'),
|
1005 | gen_HREF = genUrl(example_name),
|
1006 | user_ms = user_secs * 1000,
|
1007 | sys_ms = sys_secs * 1000,
|
1008 | max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
|
1009 | select(-c(user_secs, sys_secs, max_rss_KiB)) ->
|
1010 | details
|
1011 |
|
1012 | details %>% select(-c(sys_ms, max_rss_MB)) %>%
|
1013 | spread(key = impl, value = user_ms) %>%
|
1014 | mutate(`C++ : Python` = `C++` / Python) %>%
|
1015 | arrange(`C++ : Python`) ->
|
1016 | user_time
|
1017 |
|
1018 | details %>% select(-c(user_ms, max_rss_MB)) %>%
|
1019 | spread(key = impl, value = sys_ms) %>%
|
1020 | mutate(`C++ : Python` = `C++` / Python) %>%
|
1021 | arrange(`C++ : Python`) ->
|
1022 | sys_time
|
1023 |
|
1024 | details %>% select(-c(user_ms, sys_ms)) %>%
|
1025 | spread(key = impl, value = max_rss_MB) %>%
|
1026 | mutate(`C++ : Python` = `C++` / Python) %>%
|
1027 | arrange(`C++ : Python`) ->
|
1028 | max_rss
|
1029 |
|
1030 | # Sometimes it speeds up by more than 10x
|
1031 | precision1 = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
|
1032 | writeTsv(user_time, file.path(out_dir, 'user_time'), precision1)
|
1033 | writeTsv(sys_time, file.path(out_dir, 'sys_time'), precision1)
|
1034 |
|
1035 | precision2 = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
|
1036 | writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)
|
1037 |
|
1038 | writeTsv(details, file.path(out_dir, 'details'))
|
1039 | }
|
1040 |
|
1041 | UftraceTaskReport = function(env, task_name, summaries) {
|
1042 | # Need this again after redirect
|
1043 | MaybeDisableColor(stdout())
|
1044 |
|
1045 | task_env = env[[task_name]]
|
1046 |
|
1047 | untyped = task_env$untyped
|
1048 | typed = task_env$typed
|
1049 | strings = task_env$strings
|
1050 | slabs = task_env$slabs
|
1051 | reserve = task_env$reserve
|
1052 |
|
1053 | string_overhead = 17 # GC header (8) + len (4) + hash value (4) + NUL (1)
|
1054 | strings %>% mutate(obj_len = str_len + string_overhead) -> strings
|
1055 |
|
1056 | # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
|
1057 | # big/small
|
1058 | #
|
1059 | # And then zoom in on distributions as well
|
1060 |
|
1061 | num_allocs = nrow(untyped)
|
1062 | total_bytes = sum(untyped$obj_len)
|
1063 |
|
1064 | untyped %>% group_by(obj_len) %>% count() %>% ungroup() -> untyped_hist
|
1065 | #print(untyped_hist)
|
1066 |
|
1067 | untyped_hist %>%
|
1068 | mutate(n_less_than = cumsum(n),
|
1069 | percent = n_less_than * 100.0 / num_allocs) ->
|
1070 | alloc_sizes
|
1071 |
|
1072 | a24 = untyped_hist %>% filter(obj_len <= 24)
|
1073 | a48 = untyped_hist %>% filter(obj_len <= 48)
|
1074 | a96 = untyped_hist %>% filter(obj_len <= 96)
|
1075 |
|
1076 | allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
|
1077 | allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
|
1078 | allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs
|
1079 |
|
1080 | Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)
|
1081 |
|
1082 | options(tibble.print_min=25)
|
1083 |
|
1084 | Log('')
|
1085 | Log('All allocations')
|
1086 | print(alloc_sizes %>% head(22))
|
1087 | print(alloc_sizes %>% tail(5))
|
1088 |
|
1089 | Log('')
|
1090 | Log('Common Sizes')
|
1091 | print(untyped_hist %>% arrange(desc(n)) %>% head(8))
|
1092 |
|
1093 | Log('')
|
1094 | Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
|
1095 | Log('')
|
1096 |
|
1097 | Log('Typed allocations')
|
1098 |
|
1099 | num_typed = nrow(typed)
|
1100 |
|
1101 | typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
|
1102 | mutate(percent = n * 100.0 / num_typed) %>%
|
1103 | arrange(desc(n)) -> most_common_types
|
1104 |
|
1105 | print(most_common_types %>% head(20))
|
1106 | print(most_common_types %>% tail(5))
|
1107 |
|
1108 | lists = typed %>% filter(str_starts(func_name, ('List<')))
|
1109 | #print(lists)
|
1110 |
|
1111 | num_lists = nrow(lists)
|
1112 | total_list_bytes = num_lists * 24 # sizeof List<T> head is hard-coded
|
1113 |
|
1114 | Log('')
|
1115 | Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
|
1116 | Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
|
1117 | Log('')
|
1118 |
|
1119 | #
|
1120 | # Strings
|
1121 | #
|
1122 |
|
1123 | num_strings = nrow(strings)
|
1124 | total_string_bytes = sum(strings$obj_len)
|
1125 |
|
1126 | strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
|
1127 | mutate(n_less_than = cumsum(n),
|
1128 | percent = n_less_than * 100.0 / num_strings) ->
|
1129 | string_lengths
|
1130 |
|
1131 | strs_6_bytes_or_less = string_lengths %>% filter(str_len == 6) %>% select(percent)
|
1132 | strs_14_bytes_or_less = string_lengths %>% filter(str_len == 14) %>% select(percent)
|
1133 |
|
1134 | # Parse workload
|
1135 | # 62% of strings <= 6 bytes
|
1136 | # 84% of strings <= 14 bytes
|
1137 |
|
1138 | Log('Str - NewStr() and OverAllocatedStr()')
|
1139 | print(string_lengths %>% head(16))
|
1140 | print(string_lengths %>% tail(5))
|
1141 | Log('')
|
1142 |
|
1143 | Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
|
1144 | commas(sum(strings$str_len)), commas(total_string_bytes))
|
1145 | Log('')
|
1146 | Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
|
1147 | Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
|
1148 | Log('')
|
1149 |
|
1150 | #
|
1151 | # Slabs
|
1152 | #
|
1153 |
|
1154 | Log('NewSlab()')
|
1155 |
|
1156 | num_slabs = nrow(slabs)
|
1157 | slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
|
1158 | mutate(n_less_than = cumsum(n),
|
1159 | percent = n_less_than * 100.0 / num_slabs) ->
|
1160 | slab_lengths
|
1161 |
|
1162 | slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
|
1163 | arrange(desc(n)) -> slab_types
|
1164 |
|
1165 | Log(' Lengths')
|
1166 | print(slab_lengths %>% head())
|
1167 | print(slab_lengths %>% tail(5))
|
1168 | Log('')
|
1169 |
|
1170 | Log(' Slab Types')
|
1171 | print(slab_types %>% head())
|
1172 | print(slab_types %>% tail(5))
|
1173 | Log('')
|
1174 |
|
1175 | total_slab_items = sum(slabs$slab_len)
|
1176 |
|
1177 | Log('%s slabs, total items = %s', commas(num_slabs),
|
1178 | commas(sum(slabs$slab_len)))
|
1179 | Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
|
1180 | Log('')
|
1181 |
|
1182 | #
|
1183 | # reserve() calls
|
1184 | #
|
1185 |
|
1186 | # There should be strictly more List::reserve() calls than NewSlab
|
1187 |
|
1188 | Log('::reserve(int n)')
|
1189 | Log('')
|
1190 |
|
1191 | num_reserve = nrow(reserve)
|
1192 | reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
|
1193 | mutate(n_less_than = cumsum(n),
|
1194 | percent = n_less_than * 100.0 / num_reserve) ->
|
1195 | reserve_args
|
1196 |
|
1197 | Log(' Num Items')
|
1198 | print(reserve_args %>% head(15))
|
1199 | print(reserve_args %>% tail(5))
|
1200 | Log('')
|
1201 |
|
1202 | Log('%s reserve() calls, total items = %s', commas(num_reserve),
|
1203 | commas(sum(reserve$num_items)))
|
1204 | Log('')
|
1205 |
|
1206 | # Accounting for all allocations!
|
1207 | Log('Untyped: %s', commas(num_allocs))
|
1208 | Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
|
1209 | Log('')
|
1210 |
|
1211 | num_other_typed = num_typed - num_lists
|
1212 |
|
1213 | # Summary table
|
1214 | stats = tibble(task = task_name,
|
1215 | total_bytes_ = commas(total_bytes),
|
1216 | num_allocs_ = commas(num_allocs),
|
1217 | sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
|
1218 | num_reserve_calls = commas(num_reserve),
|
1219 |
|
1220 | percent_list_allocs = Percent(num_lists, num_allocs),
|
1221 | percent_slab_allocs = Percent(num_slabs, num_allocs),
|
1222 | percent_string_allocs = Percent(num_strings, num_allocs),
|
1223 | percent_other_typed_allocs = Percent(num_other_typed, num_allocs),
|
1224 |
|
1225 | percent_list_bytes = Percent(total_list_bytes, total_bytes),
|
1226 | percent_string_bytes = Percent(total_string_bytes, total_bytes),
|
1227 |
|
1228 | allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
|
1229 | allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
|
1230 | allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),
|
1231 |
|
1232 | strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
|
1233 | strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less),
|
1234 | )
|
1235 | summaries$stats[[task_name]] = stats
|
1236 |
|
1237 | summaries$most_common_types[[task_name]] = most_common_types
|
1238 | }
|
1239 |
|
1240 | LoadUftraceTsv = function(in_dir, env) {
|
1241 | for (task in list.files(in_dir)) {
|
1242 | Log('Loading data for task %s', task)
|
1243 | base_dir = file.path(in_dir, task)
|
1244 |
|
1245 | task_env = new.env()
|
1246 | env[[task]] = task_env
|
1247 |
|
1248 | # TSV file, not CSV
|
1249 | task_env$untyped = readTsv(file.path(base_dir, 'all-untyped.tsv'))
|
1250 | task_env$typed = readTsv(file.path(base_dir, 'typed.tsv'))
|
1251 | task_env$strings = readTsv(file.path(base_dir, 'strings.tsv'))
|
1252 | task_env$slabs = readTsv(file.path(base_dir, 'slabs.tsv'))
|
1253 | task_env$reserve = readTsv(file.path(base_dir, 'reserve.tsv'))
|
1254 |
|
1255 | # median string length is 4, mean is 9.5!
|
1256 | Log('UNTYPED')
|
1257 | print(summary(task_env$untyped))
|
1258 | Log('')
|
1259 |
|
1260 | Log('TYPED')
|
1261 | print(summary(task_env$typed))
|
1262 | Log('')
|
1263 |
|
1264 | Log('STRINGS')
|
1265 | print(summary(task_env$strings))
|
1266 | Log('')
|
1267 |
|
1268 | Log('SLABS')
|
1269 | print(summary(task_env$slabs))
|
1270 | Log('')
|
1271 |
|
1272 | Log('RESERVE')
|
1273 | print(summary(task_env$reserve))
|
1274 | Log('')
|
1275 | }
|
1276 | }
|
1277 |
|
1278 | Percent = function(n, total) {
|
1279 | sprintf('%.1f%%', n * 100.0 / total)
|
1280 | }
|
1281 |
|
1282 | PrettyPrintLong = function(d) {
|
1283 | tr = t(d) # transpose
|
1284 |
|
1285 | row_names = rownames(tr)
|
1286 |
|
1287 | for (i in 1:nrow(tr)) {
|
1288 | row_name = row_names[i]
|
1289 | cat(sprintf('%26s', row_name)) # calculated min width manually
|
1290 | cat(sprintf('%20s', tr[i,]))
|
1291 | cat('\n')
|
1292 |
|
1293 | # Extra spacing
|
1294 | if (row_name %in% c('num_reserve_calls',
|
1295 | 'percent_string_bytes',
|
1296 | 'percent_other_typed_allocs',
|
1297 | 'allocs_96_bytes_or_less')) {
|
1298 | cat('\n')
|
1299 | }
|
1300 | }
|
1301 | }
|
1302 |
|
1303 |
|
1304 | UftraceReport = function(env, out_dir) {
|
1305 | # summaries$stats should be a list of 1-row data frames
|
1306 | # summaries$top_types should be a list of types
|
1307 | summaries = new.env()
|
1308 |
|
1309 | for (task_name in names(env)) {
|
1310 | report_out = file.path(out_dir, paste0(task_name, '.txt'))
|
1311 |
|
1312 | Log('Making report for task %s -> %s', task_name, report_out)
|
1313 |
|
1314 | sink(file = report_out)
|
1315 | UftraceTaskReport(env, task_name, summaries)
|
1316 | sink() # reset
|
1317 | }
|
1318 | Log('')
|
1319 |
|
1320 | # Concate all the data frames added to summary
|
1321 | stats = bind_rows(as.list(summaries$stats))
|
1322 |
|
1323 | sink(file = file.path(out_dir, 'summary.txt'))
|
1324 | #print(stats)
|
1325 | #Log('')
|
1326 |
|
1327 | PrettyPrintLong(stats)
|
1328 | Log('')
|
1329 |
|
1330 | mct = summaries$most_common_types
|
1331 | for (task_name in names(mct)) {
|
1332 | Log('Common types in workload %s', task_name)
|
1333 | Log('')
|
1334 |
|
1335 | print(mct[[task_name]] %>% head(5))
|
1336 | Log('')
|
1337 | }
|
1338 | sink()
|
1339 |
|
1340 | # For the REPL
|
1341 | return(list(stats = stats))
|
1342 | }
|
1343 |
|
1344 | main = function(argv) {
|
1345 | action = argv[[1]]
|
1346 | in_dir = argv[[2]]
|
1347 | out_dir = argv[[3]]
|
1348 |
|
1349 | if (action == 'osh-parser') {
|
1350 | ParserReport(in_dir, out_dir)
|
1351 |
|
1352 | } else if (action == 'osh-runtime') {
|
1353 | RuntimeReport(in_dir, out_dir)
|
1354 |
|
1355 | } else if (action == 'vm-baseline') {
|
1356 | VmBaselineReport(in_dir, out_dir)
|
1357 |
|
1358 | } else if (action == 'ovm-build') {
|
1359 | OvmBuildReport(in_dir, out_dir)
|
1360 |
|
1361 | } else if (action == 'compute') {
|
1362 | ComputeReport(in_dir, out_dir)
|
1363 |
|
1364 | } else if (action == 'gc') {
|
1365 | GcReport(in_dir, out_dir)
|
1366 |
|
1367 | } else if (action == 'gc-cachegrind') {
|
1368 | GcCachegrindReport(in_dir, out_dir)
|
1369 |
|
1370 | } else if (action == 'mycpp') {
|
1371 | MyCppReport(in_dir, out_dir)
|
1372 |
|
1373 | } else if (action == 'uftrace') {
|
1374 | d = new.env()
|
1375 | LoadUftraceTsv(in_dir, d)
|
1376 | UftraceReport(d, out_dir)
|
1377 |
|
1378 | } else {
|
1379 | Log("Invalid action '%s'", action)
|
1380 | quit(status = 1)
|
1381 | }
|
1382 | Log('PID %d done', Sys.getpid())
|
1383 | }
|
1384 |
|
1385 | if (length(sys.frames()) == 0) {
|
1386 | # increase ggplot font size globally
|
1387 | #theme_set(theme_grey(base_size = 20))
|
1388 |
|
1389 | main(commandArgs(TRUE))
|
1390 | }
|