Add web-cloning scripts
author Siraaj Khandkar <siraaj@khandkar.net>
Wed, 13 Mar 2019 10:27:12 +0000 (06:27 -0400)
committer Siraaj Khandkar <siraaj@khandkar.net>
Wed, 13 Mar 2019 10:27:12 +0000 (06:27 -0400)
bin/wget-clone-web [new file with mode: 0755]
bin/wget-clone-web-page [new file with mode: 0755]
bin/wget-clone-web-site [new file with mode: 0755]

diff --git a/bin/wget-clone-web b/bin/wget-clone-web
new file mode 100755 (executable)
index 0000000..1f19c0e
--- /dev/null
@@ -0,0 +1,48 @@
+#! /bin/sh
+
+uri_and_extra_options=$@;
+
+
+# Continue partially downloaded
+OPT_CONTINUE='-c'
+
+OPT_WAIT='--wait 0.75'
+
+# Randomly select wait (above) in range from wait*0.5 to wait*1.5
+OPT_WAIT_RANDON='--random-wait'
+
+# --page-requisites "download all the files that are necessary to properly
+# display a given HTML page."
+OPT_PAGE_REQUISITES='-p'
+
+# --user-agent=""
+#OPT_USER_AGENT="-U='Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0'"
+OPT_USER_AGENT="-U='ELinks (0.12pre6; NetBSD; 800x600)'"
+
+# --convert-links
+# After the download is complete, convert the links in the document to make
+# them suitable for local viewing.  This affects not only the visible
+# hyperlinks, but any part of the document that links to external content, such
+# as embedded images, links to style sheets, hyperlinks to non-HTML content,
+# etc.
+OPT_LINKS_LOCAL='-k'
+
+# --adjust-extension
+# Ensure file extension matches file's MIME type
+OPT_EXT_ADJUST='-E'
+
+# Disrespect robots.txt
+OPT_ROBOTS_OFF='-e robots=off'
+
+
+wget \
+    -a wget.log \
+    $OPT_CONTINUE \
+    $OPT_WAIT \
+    $OPT_WAIT_RANDON \
+    $OPT_PAGE_REQUISITES \
+    $OPT_LINKS_LOCAL \
+    $OPT_EXT_ADJUST \
+    "$OPT_USER_AGENT" \
+    $OPT_ROBOTS_OFF \
+    $uri_and_extra_options
diff --git a/bin/wget-clone-web-page b/bin/wget-clone-web-page
new file mode 100755 (executable)
index 0000000..4de0d72
--- /dev/null
@@ -0,0 +1,3 @@
+#! /bin/sh
+# Forward every argument to wget-clone-web; "$@" keeps each one a separate, unglobbed word.
+wget-clone-web "$@"
diff --git a/bin/wget-clone-web-site b/bin/wget-clone-web-site
new file mode 100755 (executable)
index 0000000..0ae4419
--- /dev/null
@@ -0,0 +1,6 @@
+#! /bin/sh
+
+# Recursively download the site
+OPT_RECURSIVE='-r'
+
+wget-clone-web $OPT_RECURSIVE $@
This page took 0.019776 seconds and 4 git commands to generate.