| 1 | #!/usr/bin/env bash
|
| 2 | #
|
| 3 | # Usage:
|
| 4 | # data_lang/htm8-test.sh
|
| 5 | #
|
| 6 | # TODO:
|
| 7 | #
|
| 8 | # - htm8.py should use one-pass algorithm
|
| 9 | # - micro-syntax should check all errors
|
| 10 | # - with tests
|
| 11 | # - and then download CommonCrawl data set?
|
| 12 | #
|
| 13 | # - translate to C++
|
| 14 | # - how to handle the regexes in the lexer? Port to re2c directly?
|
| 15 | # - for find(), do we need a C++ primitive for it?
|
| 16 | # - no allocation for TagName()
|
| 17 | #
|
| 18 | # re2c considerations:
|
| 19 | # - We need to use CAPTURES, so we can't use frontend/match directly
|
| 20 | # - Could we STREAM the lexer?
|
| 21 | # - Instead of sentinel model, use something else!
|
| 22 | # - default is sentinel with padding, and there is YYFILL with padding
|
| 23 | # - there is also the separate --storable-state option
|
| 24 | # - because this can be used for queries that don't allocate
|
| 25 | # - I may also want to do this with JSON
|
| 26 | #
|
| 27 | # Features:
|
| 28 | # - Are there special rules for <svg> and <math>?
|
| 29 | # - Do we need to know about <textarea> <pre>? Those don't have the same
|
| 30 | # whitespace rules
|
| 31 |
|
| 32 |
|
# Absolute path of the directory one level above the one containing this
# script.  '&&' keeps a failed cd from silently yielding the current directory.
REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)

# Special case: we need $REPO_ROOT
# LIB_OSH keeps any value already present in the environment; the default is
# only applied when the variable is unset.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"
source "$LIB_OSH/bash-strict.sh"
# task-five is called at the bottom of this file; presumably it is defined in
# task-five.sh -- confirm.
source "$LIB_OSH/task-five.sh"
|
| 39 |
|
# Print the *.html paths reported by 'git ls-files' for the current directory,
# one per line.
site-files() {
  #find ../../oilshell/oilshell.org__deploy -name '*.html'

  # Going through git omits all the _ files
  git ls-files \
    | grep '\.html$'
}
|
| 46 |
|
| 47 | # Issues with lazylex/html.py
|
| 48 | #
|
| 49 | # - Token ID is annoying to express in Python
|
| 50 | # - re.DOTALL for newlines
|
| 51 | # - can we change that with [.\n]*?
|
| 52 | # - nongreedy match for --> and ?>
|
| 53 |
|
# Run data_lang/htm8_util.py from the repo, forwarding all arguments.
#
# PYTHONPATH is set for this single command to the repo root plus vendor/.
# The command path is quoted so a checkout path containing spaces still works.
htm8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/data_lang/htm8_util.py" "$@"
}
|
| 58 |
|
# Write two small malformed HTML fixtures under _tmp/ and feed each one to
# 'htm8-tool quick-scan' and 'htm8-tool lex-htm8'.  The fixture path is passed
# on stdin, one path per line.
test-quick-scan() {
  # The fixtures live in _tmp/, which may not exist in a fresh checkout.
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, never expanded by the shell.
  cat >_tmp/bad-top.html <<'EOF'
unfinished <!--
hi && bye
EOF

  # Keep going after a failing command so that every case below still runs.
  # Note: errexit stays off for the rest of this shell.
  set +o errexit
  echo '_tmp/bad-top.html' | htm8-tool quick-scan

  echo '_tmp/bad-top.html' | htm8-tool lex-htm8

  cat >_tmp/bad-attr.html <<'EOF'
hi <a href !>
EOF

  echo '*** bad-attr quick-scan'
  echo '_tmp/bad-attr.html' | htm8-tool quick-scan

  echo '*** bad-attr lex-htm8'
  echo '_tmp/bad-attr.html' | htm8-tool lex-htm8
}
|
| 80 |
|
| 81 | # site errors
|
| 82 | #
|
| 83 | # Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
|
| 84 | # Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
|
| 85 | # 5833374 tokens in 4710 files
|
| 86 | #
|
| 87 | # The second is the "Woboq" browser, which has CDATA
|
| 88 | # Ah I wonder if we need that.
|
| 89 |
|
| 90 | # Takes ~13 seconds
|
# Run htm8-tool over every file listed by site-files in a deployed site tree.
#
# Arguments:
#   $1 - optional; when non-empty, use ../oils.pub__deploy with 'parse-htm8'.
#        Otherwise use ../../oilshell/oilshell.org__deploy with 'lex-htm8'.
test-site() {
  local new_site=${1:-}

  # TODO:
  # - test that the top level lexes
  # - test that each tag lexes
  # - test that each quoted attribute lexes
  # - test that tags are balanced

  local dir
  local action
  if [[ -n "$new_site" ]]; then
    dir='../oils.pub__deploy'
    action='parse-htm8'
  else
    dir='../../oilshell/oilshell.org__deploy'
    action='lex-htm8'
  fi

  pushd "$dir"

  # Too many files
  # site-files | xargs wc -l | grep total

  # Not using xargs
  # The working directory has changed, so this script is re-invoked through
  # $REPO_ROOT; quoted so a checkout path with spaces still works.
  time site-files \
    | "$REPO_ROOT/$0" htm8-tool "$action"

  popd
}
|
| 120 |
|
# Job ID used in the remote path by sync-wwz, and the local directory that
# mirrors it.  Both are frozen once assigned.
SOIL_ID=8924
WWZ_DIR=_tmp/${SOIL_ID}
readonly SOIL_ID WWZ_DIR
|
| 123 |
|
# Copy the remote github-jobs/$SOIL_ID/ directory into $WWZ_DIR with rsync,
# creating $WWZ_DIR first if needed.
sync-wwz() {
  local remote=op.oilshell.org:op.oilshell.org/uuu/github-jobs/$SOIL_ID/

  mkdir -p "$WWZ_DIR"
  rsync --archive --verbose "$remote" "$WWZ_DIR/"
}
|
| 129 |
|
# Unzip every *.wwz archive in $WWZ_DIR into a sibling directory named after
# the archive (without the .wwz suffix), printing each name as it goes.
extract-wwz() {
  pushd "$WWZ_DIR"

  local z
  local name
  for z in *.wwz; do
    # With no archives present the glob stays as the literal '*.wwz'; skip it
    # rather than creating a directory called '*'.
    [[ -e "$z" ]] || continue

    # $z comes from a glob in the current directory, so it has no slashes and
    # stripping the suffix is all basename would do.
    name=${z%.wwz}

    mkdir -p -- "$name"
    pushd "$name" >/dev/null

    printf '%s\n' "$name"
    unzip "../$z"

    popd >/dev/null
  done

  popd
}
|
| 145 |
|
# Show the contents of $WWZ_DIR using the 'tree' command.
tree-wwz() {
  tree "$WWZ_DIR"
}
|
| 149 |
|
# Run 'htm8-tool parse-htm8' on every *.html file found under $WWZ_DIR.
# The file list is passed on stdin, one path per line.
test-wwz() {
  pushd "$WWZ_DIR"

  # Quoted so a checkout path containing spaces still works.
  time find . -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8

  popd
}
|
| 157 |
|
# List every file under the home directory whose name matches *.xml
# (case-insensitive).  Paths go to stdout and are also saved in
# _tmp/xml-files.txt.
find-xml() {
  time find ~ -iname '*.xml' \
    | tee _tmp/xml-files.txt
}
|
| 161 |
|
# Run 'htm8-tool parse-xml' on the paths listed in _tmp/xml-files.txt (the
# file written by find-xml).  The list is passed on stdin.
test-other-xml() {
  # 6 errors, relating to value='<' in some Python testdata files, which seems invalid
  # Quoted so a checkout path containing spaces still works.
  time "$REPO_ROOT/$0" htm8-tool parse-xml <_tmp/xml-files.txt
}
|
| 166 |
|
# Run 'htm8-tool parse-xml' on every *.xml file under the current directory,
# without descending into anything named _chroot.
test-repo-xml() {
  # OK these parse
  # Quoted so a checkout path containing spaces still works.
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | "$REPO_ROOT/$0" htm8-tool parse-xml
}
|
| 172 |
|
# Run 'htm8-tool parse-htm8' on every *.html file under the current directory.
test-repo-html() {
  # Quoted so a checkout path containing spaces still works.
  time find . -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
|
| 176 |
|
# Run 'htm8-tool parse-htm8' on every *.html file under _release/VERSION.
test-docs() {
  # Quoted so a checkout path containing spaces still works.
  time find _release/VERSION -name '*.html' \
    | "$REPO_ROOT/$0" htm8-tool parse-htm8
}
|
| 180 |
|
# Runs test-docs and nothing else.
# NOTE(review): the name suggests this is the entry point for the Soil CI job
# -- confirm.
soil-run() {
  test-docs
}
|
| 184 |
|
| 185 | # OK we have to skip the <script> tag! And <style>
|
| 186 | #
|
| 187 | # document.location = '#' + params.join('&');
|
| 188 | # gUrlHash = new UrlHash(location.hash);
|
| 189 | #
|
| 190 | # I think we don't have to skip <textarea>, though?
|
| 191 |
|
| 192 |
|
# Hand the command-line arguments to task-five.  It is not defined in this
# file; presumably it comes from the sourced task-five.sh -- confirm.
task-five "$@"

# Exit with task-five's status.  Everything after this line is unreachable.
exit
|
| 195 |
|
| 196 |
|
# Design notes on <script>, <style> and <textarea>, kept as a string.
# Unreachable: the script exits before this point.
#
# The string is closed exactly once, so the whole file parses (bash -n,
# ShellCheck).  The textarea/script example uses HTML entities in the input,
# which is what the RCDATA-vs-CDATA paragraph at the end describes.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>

You can write

<script>
if (x < y) { ... }
</script>

<script> <style> <textarea>

These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.

CDATA vs. RCDATA

<textarea>
&lt;p&gt; <!-- This will show as: <p> -->
&amp; <!-- This will show as: & -->
</textarea>

<script>
&lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
&amp; <!-- This will show literally as: &amp; -->
</script>

The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
|
| 232 |
|