Skip to content

Commit

Permalink
Adjust log processing for new cdn log format
Browse files Browse the repository at this point in the history
This adds support for the new CDN log format we are writing to S3, and
drops support for the Apache common log format, since we no longer serve
artifacts from nginx.
  • Loading branch information
tobias committed Feb 1, 2020
1 parent 9ad14ae commit 80de5bc
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 41 deletions.
15 changes: 4 additions & 11 deletions dev-resources/fake.access.log
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
127.0.0.2 - - [01/Jan/2012:06:43:40 +0000] "GET /repo/snowy/snowy/0.2.0/snowy-0.2.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.2 - - [28/Jan/2012:06:43:40 +0000] "GET /repo/snowy/snowy/0.2.0/snowy-0.2.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.1 - - [14/May/2012:06:40:59 +0000] "GET /repo/captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.2 - - [14/May/2012:06:41:59 +0000] "GET /repo/captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.2 - - [14/May/2012:06:42:40 +0000] "GET /repo/snowy/snowy/0.2.0/snowy-0.2.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.2 - - [14/May/2012:06:43:40 +0000] "GET /repo/snowy/snowy/0.2.0/snowy-0.2.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
blistering barnacles
127.0.0.4 - - [14/May/2012:06:45:40 +0000] "GET /repo/snowy/snowy/0.3.0/snowy-0.3.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30" "clojars.org"

127.0.0.3 - - [14/May/2012:06:44:59 +0000] "GET /repo/captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.4 - - [14/May/2012:06:45:40 +0000] "GET /repo/snowy/snowy/0.3.0/snowy-0.3.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
127.0.0.2 - - [14/May/2012:06:43:40 +0000] "GET /repo/snowy/snowy/0.2.0/snowy-0.2.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30"
<134>2012-04-14T06:40:59Z cache-bwi5023 s3-bucket[3217]: 3.90.141.179 "GET /captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1" 200 2377 "(null)" "Java/1.6.0_30"
billions of bilious [blue blistering] "barnacles in ten" thousand thundering "typhoons" "!"
127.0.0.4 - - [14/May/2012:06:45:40 +0000] "GET /repo/snowy/snowy/0.3.0/snowy-0.3.0.jar HTTP/1.1" 200 2377 "-" "Java/1.6.0_30" "clojars.org"
<134>2012-05-14T06:40:59Z cache-ord1741 cloudfiles-endpoint[82344]: 66.249.69.238 "-" "GET /repo/snowy/snowy/0.3.0/snowy-0.3.0.jar" 200 2377 "(null)" "Java/1.6.0_30"
<134>2012-05-14T06:40:59Z cache-ord1741 cloudfiles-endpoint[82344]: 66.249.69.238 "-" "GET /repo/snowy/snowy/0.3.0/snowy-0.3.0.jar" 200 2377 "(null)" "Java/1.6.0_30"
<134>2012-05-14T06:40:59Z cache-ord1741 s3-endpoint[82344]: 66.249.69.238 "GET /repo/snowy/snowy/0.3.0/snowy-0.3.0.jar HTTP/1.1" 200 2377 "(null)" "Java/1.6.0_30"
49 changes: 25 additions & 24 deletions src/clojars/tools/process_stats.clj
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
(ns clojars.tools.process-stats
"generate usage statistics from web log"
(:require [clojars.file-utils :as fu]
(:require [clj-time.format :as timef]
[clojars.file-utils :as fu]
[clojure.java.io :as io]
[clojure.string :as str]
[net.cgrand.regex :as re]
[clj-time.format :as timef])
(:import java.util.regex.Pattern
java.io.BufferedReader)
[net.cgrand.regex :as re])
(:import java.io.BufferedReader
java.util.regex.Pattern)
(:gen-class))

;; clj-time formatter for timestamps shaped like "14/May/2012:06:45:40 +0000",
;; i.e. day/abbreviated-month/year:hour:minute:second followed by a zone offset.
;; NOTE(review): the updated parse-line in this change parses :time with
;; time-cdn only -- confirm whether time-clf is still referenced anywhere.
(def time-clf (timef/formatter "dd/MMM/YYYY:HH:mm:ss Z"))
Expand All @@ -21,33 +20,35 @@
re/RegexFragment
(static? [_ _] true))

(def re-clf ; common log format (apache, nginx etc)
(def re-legacy-cdn
"Log format used when we logged from fastly to rackspace cloudfiles"
(let [field #"\S+"
nonbracket #"[^\]]+"
nonquote #"[^\" ]+"
reqline (list [nonquote :as :method] \space
[nonquote :as :path] \space
[nonquote :as :protocol])]
(re/regex [field :as :host] \space
[field :as :ident] \space
[field :as :authuser] \space
\[ [nonbracket :as :time] \] #"\s+"
[nonquote :as :path])]
(re/regex \< #"\d+" \>
[field :as :time] \space
[field :as :cache-host] \space
[field :as :endpoint] \: \space
[field :as :host] \space
\" [field :as :ident] \" \space
\" reqline \" \space
[field :as :status] \space
[field :as :size]
#".*")))

(def re-cdn ; log format from our fastly cdn
(def re-cdn
"Log format used when logging from fastly to s3"
(let [field #"\S+"
nonquote #"[^\" ]+"
reqline (list [nonquote :as :method] \space
[nonquote :as :path])]
[nonquote :as :path] \space
[nonquote :as :http-version])]
(re/regex \< #"\d+" \>
[field :as :time] \space
[field :as :cache-host] \space
[field :as :endpoint] \: \space
[field :as :host] \space
\" [field :as :ident] \" \space
\" reqline \" \space
[field :as :status] \space
[field :as :size]
Expand All @@ -63,8 +64,8 @@
segment \.
[#"\w+" :as :ext])))

;; Returns true when `line` begins with "<". The syslog-style CDN sample lines
;; start with a "<134>" prefix, while the Apache-style samples start with the
;; client address, so the first character is enough to tell them apart.
;; `line` must be non-nil; the check is a plain String.startsWith interop call.
(defn is-cdn? [line]
(.startsWith line "<"))
(defn is-legacy?
  "Returns true when `line` contains a space-delimited quoted dash ( \"-\" ).

  The legacy CDN sample lines carry a literal \"-\" field between the client
  address and the quoted request, whereas the newer S3 sample lines go
  straight from the client address to the request, so the presence of that
  token is used to pick the legacy regex. This is a substring test only, not
  a full parse of the line.

  `line` must be a non-nil String. The ^String hint lets the compiler emit a
  direct String.contains call instead of resolving the method by reflection
  for every log line."
  [^String line]
  (.contains line " \"-\" "))

(defn parse-path [s]
(when s
Expand All @@ -77,18 +78,18 @@
(defn parse-long [s]
(when-not (#{nil "" "-"} s)
(try (Long/parseLong s)
(catch NumberFormatException e))))
(catch NumberFormatException _))))

(defn parse-line [line]
(let [cdn? (is-cdn? line)
m (re/exec (if cdn? re-cdn re-clf) line)]
(let [legacy? (is-legacy? line)
m (re/exec (if legacy? re-legacy-cdn re-cdn) line)]
(merge
(parse-path (:path m))
{:status (parse-long (:status m))
:method (:method m)
:size (parse-long (:size m))
:time (when (:time m) (try (timef/parse (if cdn? time-cdn time-clf) (:time m))
(catch IllegalArgumentException e)))})))
:time (when (:time m) (try (timef/parse time-cdn (:time m))
(catch IllegalArgumentException _)))})))

(defn valid-download? [m]
(and m
Expand Down
10 changes: 4 additions & 6 deletions test/clojars/unit/tools/process_stats_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@
(stats/parse-path "/captain/archibald/haddock/0.1.0/haddock-0.1.0.jar"))))

(def formats
{:old-format "::ffff:127.0.0.1 - - [14/Apr/2012:06:40:59 +0000] \"GET /repo/captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1\" 200 2377 \"-\" \"Java/1.6.0_30\""
:new-format "::ffff:127.0.0.1 - - [14/Apr/2012:06:40:59 +0000] \"GET /repo/captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1\" 200 2377 \"-\" \"Java/1.6.0_30\" \"clojars.org\""
:cdn-format "<134>2012-04-14T06:40:59Z cache-ord1741 cloudfiles-endpoint[82344]: 66.249.69.238 \"-\" \"GET /captain/archibald/haddock/0.1.0/haddock-0.1.0.jar\" 200 2377 \"(null)\" \"Java/1.6.0_30\""})
{:legacy-cdn-format "<134>2012-04-14T06:40:59Z cache-ord1741 cloudfiles-endpoint[82344]: 66.249.69.238 \"-\" \"GET /captain/archibald/haddock/0.1.0/haddock-0.1.0.jar\" 200 2377 \"(null)\" \"Java/1.6.0_30\""
:cdn-format "<134>2012-04-14T06:40:59Z cache-bwi5023 s3-bucket[3217]: 3.90.141.179 \"GET /captain/archibald/haddock/0.1.0/haddock-0.1.0.jar HTTP/1.1\" 200 2377 \"(null)\" \"Java/1.6.0_30\""})

(deftest parse-line
(doseq [sample-line (vals formats)]
Expand All @@ -36,6 +35,5 @@

(deftest compute-stats
(let [stats (stats/process-log (io/resource "fake.access.log"))]
(is (= 5 (get-in stats [["snowy" "snowy"] "0.2.0"])))
(is (= 3 (get-in stats [["snowy" "snowy"] "0.3.0"])))
(is (= 3 (get-in stats [["captain.archibald" "haddock"] "0.1.0"])))))
(is (= 2 (get-in stats [["snowy" "snowy"] "0.3.0"])))
(is (= 1 (get-in stats [["captain.archibald" "haddock"] "0.1.0"])))))

0 comments on commit 80de5bc

Please sign in to comment.