From: Siraaj Khandkar Date: Mon, 16 Mar 2020 17:31:18 +0000 (-0400) Subject: Import case count scrapers for USA: NY, MA X-Git-Url: https://git.xandkar.net/?a=commitdiff_plain;h=0b60ba947fe0490f018137cbb9c51b01bdcb8b39;p=covid-19-scrapers.git Import case count scrapers for USA: NY, MA --- diff --git a/fetch-case-count-usa-ma b/fetch-case-count-usa-ma new file mode 100755 index 0000000..5a070ac --- /dev/null +++ b/fetch-case-count-usa-ma @@ -0,0 +1,101 @@ +#! /bin/sh +# +# Dependencies: +# - curl +# - awk +# - hxpipe (packaged in html-xml-utils on Debian and Ubuntu) + +VALID_LOCATIONS=' + Boston: + Massachusetts: + ' + +DEFAULT_LOCATION='Boston:' + +usage() { + printf "Usage: %s [LOCATION]\n" "$0" + printf '\n' + printf 'LOCATION =' + printf '%s\n' "$VALID_LOCATIONS" + printf "Default LOCATION:\n %s\n" "$DEFAULT_LOCATION" + exit 1 +} + +case "$1" in + '-h') usage;; + '' ) location="$DEFAULT_LOCATION";; + * ) location="$1";; +esac + +curl 'https://www.boston.gov/news/coronavirus-disease-covid-19-boston' \ +| hxpipe \ +| awk ' + /^[\(\)]/ { + update_node() + next + } + + /^A/ && $2 == "CDATA" { + update_node_attributes() + next + } + + /^-/ { + XmlPayload = substr($0, 2, length($0)) + } + + XmlPath == "/html/body/div/div/input/header/div/div/div/section/article/div/div/div/div/div/div/div/div/address" \ + && XmlPayload ~ /^[A-Z][a-z]+: +[0-9]+/ { + print XmlPayload + } + + function path_to_string(path, depth, p, i) { + p = "" + for (i = 1; i <= depth; i++) { + p = p "/" path[i] + } + return p + } + + function update_node( paren, name, key, val, path, attr) { + paren = substr($1, 1, 1) + name = substr($1, 2, length($1) - 1) + if (paren == "(") { + _depth++ + _path[_depth] = name + XmlPath = path_to_string(_path, _depth) + for (key in _hxpipe_curr_attrs) { + val = _hxpipe_curr_attrs[key] + XmlAttr[XmlPath, key] = val + } + } else if (paren == ")") { + delete _hxpipe_curr_attrs + XmlPayload = "" + for (key in XmlAttr) { + split(key, k, SUBSEP) + path = k[1] + attr = k[2] + if (path == XmlPath) delete XmlAttr[key] + } + _depth-- + XmlPath = path_to_string(_path, _depth) + } else { + printf("ERROR in input line %d - not a parenthesis: \"%s\"\n", NR, paren) > "/dev/stderr" + exit 1 + } + } + + function update_node_attributes( key, val, s) { + key = substr($1, 2, length($1)) + val = $0 + s = " +" + sub("^" $1 s $2 s, "", val) + _hxpipe_curr_attrs[key] = val + } + ' \ +| awk -v location="$location" ' + $1 == location { + weird_space_character = " "; + n = split($2, cases, weird_space_character); + print cases[1] + }' diff --git a/fetch-case-count-usa-ny b/fetch-case-count-usa-ny new file mode 100755 index 0000000..95a039f --- /dev/null +++ b/fetch-case-count-usa-ny @@ -0,0 +1,121 @@ +#! /bin/sh +# +# Dependencies: +# - curl +# - awk +# - hxpipe (packaged in html-xml-utils on Debian and Ubuntu) + +VALID_LOCATIONS=' + Albany + Broome + Delaware + Dutchess + Erie + Greene + Herkimer + Monroe + Montgomery + Nassau + Orange + Putnam + Rockland + Saratoga + Schenectady + Suffolk + Tioga + Tompkins + Ulster + Westchester + New York State (Outside of NYC) + New York City: + Total Positive Cases (Statewide) + ' +DEFAULT_LOCATION='New York City:' + +usage() { + printf "Usage: %s [LOCATION]\n" "$0" + printf '\n' + printf 'LOCATION =' + printf '%s\n' "$VALID_LOCATIONS" + printf "Default LOCATION:\n %s\n" "$DEFAULT_LOCATION" + exit 1 +} + +case "$1" in + '-h') usage;; + '' ) location="$DEFAULT_LOCATION";; + * ) location="$1";; +esac + +curl 'https://health.ny.gov/diseases/communicable/coronavirus/' \ +| hxpipe \ +| awk -v location="$location" ' + /^[\(\)]/ { + update_node() + next + } + + /^A/ && $2 == "CDATA" { + update_node_attributes() + next + } + + /^-/ { + XmlPayload = substr($0, 2, length($0)) + } + + XmlPath == "/html/body/div/div/div/div/div/div/table/tr/td" && XmlPayload == location { + found = 1; + next + } + + XmlPath == "/html/body/div/div/div/div/div/div/table/tr/td" && found { + print XmlPayload; + found = 0; + next; + } + + function path_to_string(path, depth, p, i) { + p = "" + for (i = 1; i <= depth; i++) { + p = p "/" path[i] + } + return p + } + + function update_node( paren, name, key, val, path, attr) { + paren = substr($1, 1, 1) + name = substr($1, 2, length($1) - 1) + if (paren == "(") { + _depth++ + _path[_depth] = name + XmlPath = path_to_string(_path, _depth) + for (key in _hxpipe_curr_attrs) { + val = _hxpipe_curr_attrs[key] + XmlAttr[XmlPath, key] = val + } + } else if (paren == ")") { + delete _hxpipe_curr_attrs + XmlPayload = "" + for (key in XmlAttr) { + split(key, k, SUBSEP) + path = k[1] + attr = k[2] + if (path == XmlPath) delete XmlAttr[key] + } + _depth-- + XmlPath = path_to_string(_path, _depth) + } else { + printf("ERROR in input line %d - not a parenthesis: \"%s\"\n", NR, paren) > "/dev/stderr" + exit 1 + } + } + + function update_node_attributes( key, val, s) { + key = substr($1, 2, length($1)) + val = $0 + s = " +" + sub("^" $1 s $2 s, "", val) + _hxpipe_curr_attrs[key] = val + } + '