Implement timeline crawler 0.18.0
authorSiraaj Khandkar <siraaj@khandkar.net>
Thu, 25 Mar 2021 23:23:04 +0000 (19:23 -0400)
committerSiraaj Khandkar <siraaj@khandkar.net>
Thu, 25 Mar 2021 23:56:29 +0000 (19:56 -0400)
TODO
info.rkt
tt.rkt

diff --git a/TODO b/TODO
index 83aa016..11d77ee 100644 (file)
--- a/TODO
+++ b/TODO
@@ -18,6 +18,8 @@ In-progress
     - [ ] inner
     - [ ] imports
 - [-] commands:
+  - [x] c | crawl
+    Discover new peers mentioned by known peers.
   - [x] r | read
     - see timeline ops above
   - [ ] w | write
@@ -48,9 +50,14 @@ In-progress
     - [x] mentions from timeline messages
       - [x] @<source.nick source.url>
       - [x] @<source.url>
-    - [x] "following" from timeline comments: # following = <nick> <uri>
+    - [ ] "following" from timeline comments: # following = <nick> <uri>
   - [ ] Parse User-Agent web access logs.
-  - [ ] Update peer ref file(s)
+  - [-] Update peer ref file(s)
+    - [x] peers-all
+    - [x] peers-mentioned
+    - [ ] peers-followed (by others, parsed from comments)
+    - [ ] peers-down (net errors)
+    - [ ] redirects?
     Rough sketch from late 2019:
         let read file =
             ...
@@ -93,6 +100,7 @@ In-progress
 
 Backlog
 -------
+- [ ] user-agent file as CLI option - need to run at least the crawler as another user
 - [ ] Support fetching rsync URIs
 - [ ] Check for peer duplicates:
   - [ ] same nick for N>1 URIs
index 7c2f504..1da091a 100644 (file)
--- a/info.rkt
+++ b/info.rkt
@@ -6,7 +6,7 @@
 (define pkg-desc
   "twtxt client")
 (define version
-  "0.17.1")
+  "0.18.0")
 (define pkg-authors
   '("Siraaj Khandkar <siraaj@khandkar.net>"))
 (define deps
diff --git a/tt.rkt b/tt.rkt
index 40c6ea2..bc8e8bc 100644 (file)
--- a/tt.rkt
+++ b/tt.rkt
         (log-warning "Cache file not found for URI: ~a" (url->string uri))
         "")))
 
+(: uri? (-> String Boolean))
+(define (uri? str)
+  (regexp-match? #rx"^[a-z]+://.*" (string-downcase str)))
+
 (: str->peer (String (Option Peer)))
 (define (str->peer str)
   (log-debug "Parsing peer string: ~v" str)
           (log-error "Invalid URI in string: ~v, exn: ~v" str e)
           #f)])
     (match (string-split str)
-      [(list u)   (Peer #f  (string->url u))]
-      [(list n u) (Peer  n  (string->url u))]
+      [(list u)   #:when (uri? u) (Peer #f  (string->url u))]
+      [(list n u) #:when (uri? u) (Peer  n  (string->url u))]
       [_
         (log-error "Invalid peer string: ~v" str)
         #f])))
 (define (str->peers str)
   (filter-map str->peer (filter-comments (str->lines str))))
 
+(: peers->file (-> (Listof Peers) Path-String Void))
+(define (peers->file peers path)
+  (display-lines-to-file
+    (map (match-lambda
+           [(Peer n u)
+            (format "~a~a" (if n (format "~a " n) "") (url->string u))])
+         peers)
+    path
+    #:exists 'replace))
+
 (: file->peers (-> Path-String (Listof Peer)))
 (define (file->peers file-path)
   (if (file-exists? file-path)
       (str->peers (file->string file-path))
       (begin
-        (log-error "File does not exist: ~v" (path->string file-path))
+        (log-warning "File does not exist: ~v" (path->string file-path))
         '())))
 
 (define re-rfc2822
   ; TODO No need for map - can just iter
   (void (concurrent-filter-map num-workers peer-download peers)))
 
-; TODO timeline contract : time-sorted list of messages
-(: timeline-read (-> Timeline-Order (Listof Peer) (Listof Msg)))
-(define (timeline-read order peers)
+(define (uniq xs)
+  (set->list (list->set xs)))
+
+(: peers->timeline (-> (listof Peer) (listof Msg)))
+(define (peers->timeline peers)
+  (append* (filter-map peer->msgs peers)))
+
+(: timeline-sort (-> (listof Msg) timeline-order (Listof Msgs)))
+(define (timeline-sort msgs order)
   (define cmp (match order
                 ['old->new <]
                 ['new->old >]))
-  (sort (append* (filter-map peer->msgs peers))
-        (λ (a b) (cmp (Msg-ts-epoch a) (Msg-ts-epoch b)))))
+  (sort msgs (λ (a b) (cmp (Msg-ts-epoch a)
+                           (Msg-ts-epoch b)))))
 
 (: paths->peers (-> (Listof String) (Listof Peer)))
 (define (paths->peers paths)
       #:help-labels
       ""
       "and <command> is one of"
-      "r, read     : Read the timeline."
+      "r, read     : Read the timeline (offline operation)."
       "d, download : Download the timeline."
       ; TODO Add path dynamically
       "u, upload   : Upload your twtxt file (alias to execute ~/.tt/upload)."
+      "c, crawl    : Discover new peers mentioned by known peers (offline operation)."
       ""
       #:args (command . args)
       (define log-writer (log-writer-start log-level))
               njobs "Number of concurrent jobs."
               (set! num-workers (string->number njobs))]
              #:args file-paths
-             (define-values (_res _cpu real-ms _gc)
-               (time-apply timeline-download (list num-workers (paths->peers file-paths))))
-             (log-info "Timeline downloaded in ~a seconds." (/ real-ms 1000.0))))]
+             (let ([peers (paths->peers file-paths)])
+               (define-values (_res _cpu real-ms _gc)
+                 (time-apply timeline-download (list num-workers peers)))
+               (log-info "Downloaded timelines from ~a peers in ~a seconds."
+                         (length peers)
+                         (/ real-ms 1000.0)))))]
         [(or "u" "upload")
          (command-line
            #:program
               "Long output format"
               (set! out-format 'multi-line)]
              #:args file-paths
-             (timeline-print out-format (timeline-read order (paths->peers file-paths)))))]
+             (let* ([peers
+                      (paths->peers file-paths)]
+                    [timeline
+                      (timeline-sort (peers->timeline peers) order)])
+               (timeline-print out-format timeline))))]
+        [(or "c" "crawl")
+         (command-line
+           #:program
+           "tt crawl"
+           #:args file-paths
+           (let* ([peers-all-file
+                    (build-path tt-home-dir "peers-all")]
+                  [peers-mentioned-file
+                    (build-path tt-home-dir "peers-mentioned")]
+                  [peers
+                    (paths->peers
+                      (match file-paths
+                        ; TODO Refactor such that path->string not needed
+                        ['() (list (path->string peers-all-file))]
+                        [_   file-paths]))]
+                  [timeline
+                    (peers->timeline peers)]
+                  [peers-mentioned-curr
+                    (uniq (append* (map Msg-mentions timeline)))]
+                  [peers-mentioned-prev
+                    (file->peers peers-mentioned-file)]
+                  [peers-mentioned
+                    (uniq (append peers-mentioned-prev
+                                  peers-mentioned-curr))]
+                  [peers-all-prev
+                    (file->peers peers-all-file)]
+                  [peers-all
+                    (uniq (append peers
+                                  peers-mentioned
+                                  peers-all-prev))])
+             (peers->file peers-mentioned
+                          peers-mentioned-file)
+             (peers->file peers-all
+                          peers-all-file)))]
         [command
           (eprintf "Error: invalid command: ~v\n" command)
           (eprintf "Please use the \"--help\" option to see a list of available commands.\n")
This page took 0.03724 seconds and 4 git commands to generate.