OILS / benchmarks / id.sh View on Github | oils.pub

483 lines, 253 significant
1#!/usr/bin/env bash
2#
3# Keep track of benchmark data provenance.
4#
5# Usage:
6# benchmarks/id.sh <function name>
7
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the parent of the directory that contains this script.
# "$0" is quoted so a checkout under a path with spaces still resolves, and
# '&&' keeps a failed 'cd' from silently reporting the caller's working
# directory instead.
REPO_ROOT=$(cd "$(dirname -- "$0")/.." && pwd)
13
14source build/common.sh # for $CLANG
15source benchmarks/common.sh
16source test/tsv-lib.sh # tsv-row
17
print-job-id() {
  ### Print a job ID made from the current local time: YYYY-MM-DD__HH-MM-SS

  local -r stamp_format='+%Y-%m-%d__%H-%M-%S'
  date "$stamp_format"
}
21
22# TODO: add benchmark labels/hashes for osh and all other shells
23#
24# Need to archive labels too.
25#
26# TODO: How do I make sure the zsh label is current? Across different
27# machines?
28#
29# What happens when zsh is silently upgraded?
30# I guess before every benchmark, you have to run the ID collection. Man
31# that is a lot of code.
32#
33# Should I make symlinks to the published location?
34#
35# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
36# Every symlink is a shell runtime version, and it has an associated
37# toolchain?
38
39# Platform is ambient?
40# _tmp/
41# shell-id/
42# bash/
43# HASH.txt
44# version.txt
45# dash/
46# HASH.txt
47# version.txt
48# host-id/
49# lisa/
50# HASH.txt
51# cpuinfo.txt
52
53# ../benchmark-data/
54# shell-id/
55# bash-$HASH/
56# osh-$HASH/ # osh-cpython, osh-ovm? osh-opy-ovm? Too many dimensions.
57# # the other shells don't have this?
58# zsh-$HASH/
59# host-id/
60# lisa-$HASH/
61
_dump-if-exists() {
  ### Copy the contents of a file to another path, if the source exists
  #
  # Args:
  #   $1: file to read
  #   $2: file to write (created or truncated)
  #
  # Returns 0 without writing anything when $1 is not a regular file.

  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return 0
  fi

  # The redirect target must be quoted: an unquoted target that contains
  # whitespace is an "ambiguous redirect" error in bash.
  cat -- "$path" > "$out"
}
70
71#
72# Shell ID
73#
74
dump-shell-id() {
  ### Write files that identify the shell
  #
  # Args:
  #   $1: shell (or other interpreter) to identify; it is looked up with
  #       'command -v' and then executed as given
  #   $2: output directory, created if necessary
  #
  # Always writes sh-path.txt.  Depending on the path and basename of $1 it
  # also writes git-branch.txt / git-commit-hash.txt and one of version.txt,
  # osh-version.txt, ysh-version.txt, awk-version.txt or dpkg-version.txt.
  #
  # Calls 'die' (not defined in this function; presumably it aborts) when $1
  # can't be found or its basename isn't one of the known names.
  #
  # All expansions are quoted so that paths containing whitespace work.

  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  echo "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh*|*/ysh*)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        # The commit was supplied through the environment; don't ask git.
        echo "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        echo "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  case $sh_name in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh*)
      case $sh_path in
        *_bin/*/osh)  # Is this branch dead?
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    ysh*)
      "$sh_path" --version > "$out_dir/ysh-version.txt"
      ;;
    awk)
      "$sh_path" --version > "$out_dir/awk-version.txt"
      ;;

    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # Only stderr is captured for -V.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
143
_shell-id-hash() {
  ### Print the text that gets hashed to identify a shell
  #
  # Args:
  #   $1: directory containing the files written by dump-shell-id
  #
  # Files that don't exist are skipped.  Always returns 0.
  #
  # 'grep -E' replaces the obsolescent 'egrep'; what is printed on stdout, and
  # therefore the resulting hash, is the same.

  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  test -f "$file" && cat "$file"

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  test -f "$file" && grep -E '^Version' "$file"

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  test -f "$file" && grep -E '^Oil version|^Interpreter:' "$file"

  # For OSH
  file=$src/git-commit-hash.txt
  test -f "$file" && cat "$file"

  # XXX: Include shell path to help distinguish between versions of OSH
  echo "$src"

  return 0
}
172
publish-shell-id() {
  ### Copy temp directory to hashed location
  #
  # Args:
  #   $1: directory written by dump-shell-id, e.g. _tmp/prov-tmp/osh
  #   $2: base directory to publish under
  #       (default ../benchmark-data/shell-id), or e.g. _tmp/shell-id
  #
  # Copies $1 to $2/<shell name>-<id>, writes HASH.txt there, and prints the
  # 8-character <id> on stdout.  Callers capture stdout with $(...), which
  # assumes 'log' doesn't write to stdout -- confirm.
  #
  # Paths are quoted so that directories containing whitespace work.

  local src=$1
  local dest_base=${2:-../benchmark-data/shell-id}

  local sh_path sh_name
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: the captured md5sum output has several fields, and
  # word splitting joins them with a single space.  Quoting would change the
  # bytes stored in HASH.txt.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published shell ID to $dest"

  echo "$id"
}
198
199#
200# Platform ID
201#
202
203# Events that will change the env for a given machine:
204# - kernel upgrade
205# - distro upgrade
206
207# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
208# How to calculate the hash though?
209
dump-host-id() {
  ### Write files that identify the host
  #
  # Args:
  #   $1: output directory (default _tmp/host-id/<hostname>), created if
  #       necessary
  #
  # Writes hostname.txt, machine.txt and kernel.txt, plus lsb-release.txt,
  # cpuinfo.txt and meminfo.txt when the corresponding system files exist.
  #
  # The output directory is quoted so that paths with whitespace work.

  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"

  {
    # Short flags work on OS X too
    uname -s  # --kernel-name
    uname -r  # --kernel-release
    uname -v  # --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  if test -e /proc/cpuinfo; then
    grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  fi

  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  if test -e /proc/meminfo; then
    grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"
  fi

  #head $out_dir/* 1>&2  # don't write to stdout
}
248
# There is already a concept of the target triple?
250# http://wiki.osdev.org/Target_Triplet
251# It's not exactly the same as what we need here, but close.
252
_host-id-hash() {
  ### Print the text that gets hashed to identify a host
  #
  # Args:
  #   $1: directory containing the files written by dump-host-id
  #
  # Prints machine.txt and kernel.txt, then lsb-release.txt if it exists.
  # Paths are quoted so that a directory containing whitespace works.

  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
271
publish-host-id() {
  ### Copy a host ID directory to a hashed location; write a short ID to stdout
  #
  # Args:
  #   $1: directory written by dump-host-id, e.g. _tmp/host-id/lisa
  #   $2: base directory to publish under
  #       (default ../benchmark-data/host-id)
  #
  # Copies $1 to $2/<basename of $1>-<id>, writes HASH.txt there, and prints
  # the 8-character <id>.  Callers capture stdout with $(...), which assumes
  # 'log' doesn't write to stdout -- confirm.
  #
  # Paths are quoted so that directories containing whitespace work.

  local src=$1
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: the captured md5sum output has several fields, and
  # word splitting joins them with a single space.  Quoting would change the
  # bytes stored in HASH.txt.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published host ID to $dest"

  echo "$id"
}
295
296#
297# Compilers
298#
299
dump-compiler-id() {
  ### Write files that identify the compiler
  #
  # Args:
  #   $1: path to the compiler
  #   $2: output directory (default _tmp/compiler-id/<basename of $1>),
  #       created if necessary
  #
  # Writes version.txt.  Only paths ending in /gcc or /clang are recognized;
  # for anything else version.txt is created empty.
  #
  # Paths are quoted so that compilers and output directories whose names
  # contain whitespace work.

  local cc=$1
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc|*/clang)
      # 'gcc -v' has more details, but they might be overkill, and 'clang -v'
      # has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
319
_compiler-id-hash() {
  ### Print the text that gets hashed to identify a compiler
  #
  # Args:
  #   $1: directory containing version.txt, as written by dump-compiler-id
  #
  # Returns 0 even when nothing is printed, like the other *-id-hash
  # functions; grep's real errors (status > 1, e.g. a missing file) are
  # passed on.

  local src=$1
  local status=0

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt" || status=$?

  # grep exits 1 when it selects no lines, which happens for an empty
  # version.txt.  That is not an error here.
  if test "$status" -gt 1; then
    return "$status"
  fi
  return 0
}
326
publish-compiler-id() {
  ### Copy a compiler ID directory to a hashed location; write a short ID to stdout
  #
  # Args:
  #   $1: directory written by dump-compiler-id, e.g. _tmp/compiler-id/clang
  #   $2: base directory to publish under
  #       (default ../benchmark-data/compiler-id)
  #
  # Copies $1 to $2/<basename of $1>-<id>, writes HASH.txt there, and prints
  # the 8-character <id>.  Callers capture stdout with $(...), which assumes
  # 'log' doesn't write to stdout -- confirm.
  #
  # Paths are quoted so that directories containing whitespace work.

  local src=$1
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately from the assignment so that 'local' doesn't mask the
  # exit status of basename.
  local name
  name=$(basename "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: the captured md5sum output has several fields, and
  # word splitting joins them with a single space.  Quoting would change the
  # bytes stored in HASH.txt.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
348
349#
350# Table Output
351#
352
# Writes a table of host and shells to _tmp/provenance.{txt,tsv}.  Writes text
# files and calculates IDs for them as a side effect.
#
# The table can be passed to other benchmarks to ensure that their provenance
# is recorded.

shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell-id,host-id}
  #
  # Args:
  #   $1: host label; if empty, the output of 'hostname' is used instead
  #   $2: job ID, copied into every row
  #   $3: directory under which host-id/ and shell-id/ are published
  #   $4...: paths of the shells to record, one row each
  #
  # Both output files are truncated first.  The .txt file has no header; the
  # .tsv file starts with a header row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** shell-provenance-2 $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")

  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncated, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Keep the loop variable local so it doesn't leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so the temp dir is numbered rather than
    # named after the shell.

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
414
provenance-for-testing() {
  ### Record shell provenance under _tmp/local-benchmarks, for local runs

  local bench_dir=_tmp/local-benchmarks
  mkdir -v -p $bench_dir

  # Everything to record: the shells in $SHELLS, then $OSH_CPP_TWO, then
  # python2.  $OSH_CPP_TWO stays unquoted, as before.
  local -a to_record=( "${SHELLS[@]}" $OSH_CPP_TWO python2 )

  shell-provenance-2 $(hostname) 2025__test-job $bench_dir "${to_record[@]}"
}
424
compiler-provenance-2() {
  ### Write to _tmp/compiler-provenance.{txt,tsv} and $out_dir/{compiler-id,host-id}
  #
  # Args:
  #   $1: host label; if empty, the output of 'hostname' is used instead
  #   $2: job ID, copied into every row
  #   $3: directory under which host-id/ and compiler-id/ are published
  #
  # Records the gcc found on $PATH (if any) and $CLANG.  Both output files are
  # truncated first.  The .txt file has no header; the .tsv file starts with a
  # header row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** compiler-provenance-2 $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")

  local compiler_hash

  local out_txt=_tmp/compiler-provenance.txt  # Legacy text file
  : > "$out_txt"  # truncated, no header

  local out_tsv=_tmp/compiler-provenance.tsv
  tsv-row job_id host_name host_hash compiler_path compiler_hash > "$out_tsv"

  # Keep the loop variables local so they don't leak into the caller's scope,
  # and declare 'name' apart from its assignment so 'local' doesn't mask the
  # exit status of basename.
  local compiler_path name

  # The word list is deliberately unquoted: if gcc isn't installed the command
  # substitution is empty and must vanish rather than become an empty entry.
  # shellcheck disable=SC2046,SC2086
  for compiler_path in $(command -v gcc) $CLANG; do
    name=$(basename "$compiler_path")

    tmp_dir=_tmp/prov-tmp/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir" "$out_dir/compiler-id")

    echo "$job_id $host_name $host_hash $compiler_path $compiler_hash" \
      >> "$out_txt"

    tsv-row \
      "$job_id" "$host_name" "$host_hash" "$compiler_path" "$compiler_hash" \
      >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
473
out-param() {
  ### Assign the string 'returned' to the variable whose name is $1
  #
  # Args:
  #   $1: name of the caller's variable to assign to
  #
  # Uses a bash nameref.  The nameref has a private-looking name so that a
  # caller variable called 'out' doesn't produce a circular name reference.

  local -n __out_param_ref=$1

  __out_param_ref=returned
}
479
# Run the arguments as a command (normally one of the functions above), but
# only when $0 is named id.sh, so a differently named script that sources this
# file doesn't dispatch.
#
# "$0" is quoted and passed after '--' so that a path with spaces, or a $0
# that starts with '-' (e.g. '-bash'), doesn't make basename or the comparison
# fail.
if [[ "$(basename -- "$0")" == 'id.sh' ]]; then
  "$@"
fi
483