From: Siraaj Khandkar Date: Thu, 25 Mar 2021 23:23:04 +0000 (-0400) Subject: Implement timeline crawler X-Git-Tag: 0.18.0 X-Git-Url: https://git.xandkar.net/?p=tt.git;a=commitdiff_plain;h=a60c484e9105bad3ec0bd90148b27f44dbfeb266 Implement timeline crawler --- diff --git a/TODO b/TODO index 83aa016..11d77ee 100644 --- a/TODO +++ b/TODO @@ -18,6 +18,8 @@ In-progress - [ ] inner - [ ] imports - [-] commands: + - [x] c | crawl + Discover new peers mentioned by known peers. - [x] r | read - see timeline ops above - [ ] w | write @@ -48,9 +50,14 @@ In-progress - [x] mentions from timeline messages - [x] @ - [x] @ - - [x] "following" from timeline comments: # following = + - [ ] "following" from timeline comments: # following = - [ ] Parse User-Agent web access logs. - - [ ] Update peer ref file(s) + - [-] Update peer ref file(s) + - [x] peers-all + - [x] peers-mentioned + - [ ] peers-followed (by others, parsed from comments) + - [ ] peers-down (net errors) + - [ ] redirects? Rough sketch from late 2019: let read file = ... @@ -93,6 +100,7 @@ In-progress Backlog ------- +- [ ] user-agent file as CLI option - need to run at least the crawler as another user - [ ] Support fetching rsync URIs - [ ] Check for peer duplicates: - [ ] same nick for N>1 URIs diff --git a/info.rkt b/info.rkt index 7c2f504..1da091a 100644 --- a/info.rkt +++ b/info.rkt @@ -6,7 +6,7 @@ (define pkg-desc "twtxt client") (define version - "0.17.1") + "0.18.0") (define pkg-authors '("Siraaj Khandkar ")) (define deps diff --git a/tt.rkt b/tt.rkt index 40c6ea2..bc8e8bc 100644 --- a/tt.rkt +++ b/tt.rkt @@ -279,6 +279,10 @@ (log-warning "Cache file not found for URI: ~a" (url->string uri)) ""))) +(: uri? (-> String Boolean)) +(define (uri? str) + (regexp-match? #rx"^[a-z]+://.*" (string-downcase str))) + (: str->peer (String (Option Peer))) (define (str->peer str) (log-debug "Parsing peer string: ~v" str) @@ -288,8 +292,8 @@ (log-error "Invalid URI in string: ~v, exn: ~v" str e) #f)]) (match (string-split str) - [(list u) (Peer #f (string->url u))] - [(list n u) (Peer n (string->url u))] + [(list u) #:when (uri? u) (Peer #f (string->url u))] + [(list n u) #:when (uri? u) (Peer n (string->url u))] [_ (log-error "Invalid peer string: ~v" str) #f]))) @@ -303,12 +307,22 @@ (define (str->peers str) (filter-map str->peer (filter-comments (str->lines str)))) +(: peers->file (-> (Listof Peers) Path-String Void)) +(define (peers->file peers path) + (display-lines-to-file + (map (match-lambda + [(Peer n u) + (format "~a~a" (if n (format "~a " n) "") (url->string u))]) + peers) + path + #:exists 'replace)) + (: file->peers (-> Path-String (Listof Peer))) (define (file->peers file-path) (if (file-exists? file-path) (str->peers (file->string file-path)) (begin - (log-error "File does not exist: ~v" (path->string file-path)) + (log-warning "File does not exist: ~v" (path->string file-path)) '()))) (define re-rfc2822 @@ -462,14 +476,20 @@ ; TODO No need for map - can just iter (void (concurrent-filter-map num-workers peer-download peers))) -; TODO timeline contract : time-sorted list of messages -(: timeline-read (-> Timeline-Order (Listof Peer) (Listof Msg))) -(define (timeline-read order peers) +(define (uniq xs) + (set->list (list->set xs))) + +(: peers->timeline (-> (listof Peer) (listof Msg))) +(define (peers->timeline peers) + (append* (filter-map peer->msgs peers))) + +(: timeline-sort (-> (listof Msg) timeline-order (Listof Msgs))) +(define (timeline-sort msgs order) (define cmp (match order ['old->new <] ['new->old >])) - (sort (append* (filter-map peer->msgs peers)) - (λ (a b) (cmp (Msg-ts-epoch a) (Msg-ts-epoch b))))) + (sort msgs (λ (a b) (cmp (Msg-ts-epoch a) + (Msg-ts-epoch b))))) (: paths->peers (-> (Listof String) (Listof Peer))) (define (paths->peers paths) @@ -523,10 +543,11 @@ #:help-labels "" "and is one of" - "r, read : Read the timeline." + "r, read : Read the timeline (offline operation)." "d, download : Download the timeline." ; TODO Add path dynamically "u, upload : Upload your twtxt file (alias to execute ~/.tt/upload)." + "c, crawl : Discover new peers mentioned by known peers (offline operation)." "" #:args (command . args) (define log-writer (log-writer-start log-level)) @@ -544,9 +565,12 @@ njobs "Number of concurrent jobs." (set! num-workers (string->number njobs))] #:args file-paths - (define-values (_res _cpu real-ms _gc) - (time-apply timeline-download (list num-workers (paths->peers file-paths)))) - (log-info "Timeline downloaded in ~a seconds." (/ real-ms 1000.0))))] + (let ([peers (paths->peers file-paths)]) + (define-values (_res _cpu real-ms _gc) + (time-apply timeline-download (list num-workers peers))) + (log-info "Downloaded timelines from ~a peers in ~a seconds." + (length peers) + (/ real-ms 1000.0)))))] [(or "u" "upload") (command-line #:program @@ -573,7 +597,45 @@ "Long output format" (set! out-format 'multi-line)] #:args file-paths - (timeline-print out-format (timeline-read order (paths->peers file-paths)))))] + (let* ([peers + (paths->peers file-paths)] + [timeline + (timeline-sort (peers->timeline peers) order)]) + (timeline-print out-format timeline))))] + [(or "c" "crawl") + (command-line + #:program + "tt crawl" + #:args file-paths + (let* ([peers-all-file + (build-path tt-home-dir "peers-all")] + [peers-mentioned-file + (build-path tt-home-dir "peers-mentioned")] + [peers + (paths->peers + (match file-paths + ; TODO Refactor such that path->string not needed + ['() (list (path->string peers-all-file))] + [_ file-paths]))] + [timeline + (peers->timeline peers)] + [peers-mentioned-curr + (uniq (append* (map Msg-mentions timeline)))] + [peers-mentioned-prev + (file->peers peers-mentioned-file)] + [peers-mentioned + (uniq (append peers-mentioned-prev + peers-mentioned-curr))] + [peers-all-prev + (file->peers peers-all-file)] + [peers-all + (uniq (append peers + peers-mentioned + peers-all-prev))]) + (peers->file peers-mentioned + peers-mentioned-file) + (peers->file peers-all + peers-all-file)))] [command (eprintf "Error: invalid command: ~v\n" command) (eprintf "Please use the \"--help\" option to see a list of available commands.\n")