From 06ec33cbfa90c35e1c507bf54bba8a3b01b58f90 Mon Sep 17 00:00:00 2001
From: Siraaj Khandkar
Date: Wed, 13 Mar 2019 06:27:12 -0400
Subject: [PATCH] Add web-cloning scripts

Add three wget wrappers:

- wget-clone-web: runs wget with the shared default options (resume,
  randomized wait, page requisites, link conversion, extension adjustment,
  custom user agent, robots.txt ignored) and appends its log to ./wget.log.
- wget-clone-web-page: calls wget-clone-web with the arguments as given.
- wget-clone-web-site: calls wget-clone-web with -r (recursive) prepended.

Arguments are forwarded as "$@" so that URLs and extra options keep their
word boundaries and are not glob-expanded by the shell.
---
 bin/wget-clone-web      | 56 +++++++++++++++++++++++++++++++++++++++++
 bin/wget-clone-web-page |  4 +++
 bin/wget-clone-web-site |  7 ++++++
 3 files changed, 67 insertions(+)
 create mode 100755 bin/wget-clone-web
 create mode 100755 bin/wget-clone-web-page
 create mode 100755 bin/wget-clone-web-site

diff --git a/bin/wget-clone-web b/bin/wget-clone-web
new file mode 100755
--- /dev/null
+++ b/bin/wget-clone-web
@@ -0,0 +1,56 @@
+#! /bin/sh
+#
+# Usage: wget-clone-web [WGET-OPTION...] URL...
+#
+# Clone web content for local viewing.  All arguments are passed through to
+# wget unchanged, after the default options below.  wget's log output is
+# appended to ./wget.log.
+
+# Continue partially downloaded
+OPT_CONTINUE='-c'
+
+# Base delay between retrievals, in seconds
+OPT_WAIT='--wait=0.75'
+
+# Randomly select wait (above) in range from wait*0.5 to wait*1.5
+OPT_WAIT_RANDOM='--random-wait'
+
+# --page-requisites "download all the files that are necessary to properly
+# display a given HTML page."
+OPT_PAGE_REQUISITES='-p'
+
+# --user-agent=""
+# The whole option is a single word: no shell quoting may be embedded in the
+# value, because it would be sent to the server as part of the header.
+#OPT_USER_AGENT='--user-agent=Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0'
+OPT_USER_AGENT='--user-agent=ELinks (0.12pre6; NetBSD; 800x600)'
+
+# --convert-links
+# After the download is complete, convert the links in the document to make
+# them suitable for local viewing. This affects not only the visible
+# hyperlinks, but any part of the document that links to external content, such
+# as embedded images, links to style sheets, hyperlinks to non-HTML content,
+# etc.
+OPT_LINKS_LOCAL='-k'
+
+# --adjust-extension
+# Ensure file extension matches file's MIME type
+OPT_EXT_ADJUST='-E'
+
+# Disrespect robots.txt
+OPT_ROBOTS_OFF='--execute=robots=off'
+
+# Every expansion is quoted so that each option, and each caller-supplied
+# argument, reaches wget as exactly one word (no re-splitting, no globbing of
+# URLs containing characters such as "?" or "*").
+wget \
+    -a wget.log \
+    "$OPT_CONTINUE" \
+    "$OPT_WAIT" \
+    "$OPT_WAIT_RANDOM" \
+    "$OPT_PAGE_REQUISITES" \
+    "$OPT_LINKS_LOCAL" \
+    "$OPT_EXT_ADJUST" \
+    "$OPT_USER_AGENT" \
+    "$OPT_ROBOTS_OFF" \
+    "$@"
diff --git a/bin/wget-clone-web-page b/bin/wget-clone-web-page
new file mode 100755
--- /dev/null
+++ b/bin/wget-clone-web-page
@@ -0,0 +1,4 @@
+#! /bin/sh
+
+# Clone a single page: pass all arguments through to wget-clone-web intact.
+wget-clone-web "$@"
diff --git a/bin/wget-clone-web-site b/bin/wget-clone-web-site
new file mode 100755
--- /dev/null
+++ b/bin/wget-clone-web-site
@@ -0,0 +1,7 @@
+#! /bin/sh
+
+# Recursively download the site
+OPT_RECURSIVE='-r'
+
+# Quoted so caller-supplied arguments are neither re-split nor glob-expanded.
+wget-clone-web "$OPT_RECURSIVE" "$@"
-- 
2.20.1