# cc.R

#!/usr/bin/env Rscript

suppressPackageStartupMessages({
  library(argparser, quietly = TRUE)
  library(magrittr, quietly = TRUE)
})

# Command-line interface definition.
#
# Options:
#   --domain / -d  domain to look up in the Common Crawl index
#   --out    / -o  file to write the JSON records to; when omitted the
#                  records are written to stdout
#   --list   / -l  flag: only list the available indexes
#   --index  / -i  name of a specific index to query instead of the latest
#
# Unset character options default to NA so later code can test them with
# is.na().
parser <- arg_parser(
  description = 'Extract metadata of a specific target based on the results of "commoncrawl.org"

Examples:

$ ./cc.R --list                                      # list indices
$ ./cc.R --domain github.com                         # defaults to most recent index
$ ./cc.R --domain github.com --out /tmp/gh.json      # specify an output file
$ ./cc.R --index CC-MAIN-2018-34 --domain github.com # specify which index'
) %>%
  add_argument(
    arg = "--domain",
    help = "domain which will be crawled",
    type = "character",
    short = "-d",
    default = NA_character_
  ) %>%
  add_argument(
    arg = "--out",
    # The script writes to stdout when no file is given, so say so here
    help = "specify an output file (default: stdout)",
    type = "character",
    short = "-o",
    default = NA_character_
  ) %>%
  add_argument(
    arg = "--list",
    help = "list all available indexes",
    short = "-l",
    flag = TRUE
  ) %>%
  add_argument(
    arg = "--index",
    help = "use a specific index file",
    type = "character",
    short = "-i",
    default = NA_character_
  )

# Parse the command line. Every mode except --list needs a target domain, so
# a missing --domain without --list is a usage error: print the help text and
# exit with a non-zero status.
opts <- parse_args(parser)

# `&&` is the scalar, short-circuiting operator intended for `if` conditions
if (is.na(opts$domain) && !opts$list) {
  print(parser)
  quit(save = "no", status = 1)
}

suppressPackageStartupMessages({
  library(xml2, quietly = TRUE)
  library(httr, quietly = TRUE)
  library(rvest, quietly = TRUE)
  library(lubridate, quietly = TRUE)
  library(jsonlite, quietly = TRUE)
  library(tidyverse, quietly = TRUE)
})

# Directory that holds the cached index list (`indexes.rds`)
cache_dir <- path.expand("~/.cc.R")

#' Setup the cache directory
#'
#' Creates `cache_dir` (including any missing parent directories) when it does
#' not exist yet. Stops with an error when the directory cannot be created,
#' instead of carrying on with only a warning.
#'
#' @md
#' @return `NULL`, invisibly; called for its side effect
setup_cache <- function() {
  if (dir.exists(cache_dir)) {
    return(invisible(NULL))
  }

  created <- dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)

  # Re-check existence so a directory created concurrently by someone else is
  # not reported as a failure
  if (!created && !dir.exists(cache_dir)) {
    stop("Unable to create cache directory: ", cache_dir, call. = FALSE)
  }

  invisible(NULL)
}

#' Rebuild the cached list of CC crawl indexes
#'
#' Downloads the index listing page, takes the link targets from the first
#' table column and the text labels from the second one, and combines them
#' into a data frame. The result is written to `indexes.rds` inside the cache
#' directory, so the cache directory must already exist.
#'
#' @md
#' @return data frame (`month`/`year`/`path`), as returned by the write call
refresh_index_cache <- function() {

  listing <- xml2::read_html("http://index.commoncrawl.org/")

  # href of every link found in the first cell of a table row
  link_nodes <- rvest::html_nodes(listing, xpath = ".//td[1]/*/a")
  idx_paths <- rvest::html_attr(link_nodes, "href")

  # text of the second cell, with " Index" removed and split on spaces
  label_nodes <- rvest::html_nodes(listing, xpath = ".//td[2]")
  labels <- rvest::html_text(label_nodes, trim = TRUE)
  label_parts <- str_split(str_replace(labels, " Index", ""), " ")

  # name the two pieces of each label, then bind them into one row per index
  named_parts <- purrr::map(label_parts, set_names, c("month", "year"))
  idx <- mutate(map_df(named_parts, as.list), path = idx_paths)

  readr::write_rds(idx, file.path(cache_dir, "indexes.rds"))

}

#' Fetch cached or current CC crawl index paths
#'
#' Returns the cached index list. The cache is rebuilt through
#' `refresh_index_cache()` when the cache file is missing, or when it is past
#' the 25th day of the month and the cache holds no entry for the current
#' month and year.
#'
#' @md
#' @return data frame (`month`/`year`/`path`)
fetch_indexes <- function() {

  cache_file <- file.path(cache_dir, "indexes.rds")

  if (!file.exists(cache_file)) {
    return(refresh_index_cache())
  }

  idx <- readr::read_rds(cache_file)

  # Take the date once so month, year and day all describe the same day
  today <- Sys.Date()

  # The cached labels are scraped from an English page (they carry an
  # " Index" suffix), so build the month name from the locale-independent
  # `month.name` constant rather than a label localized to the session
  this_month <- month.name[lubridate::month(today)]

  # The cached `year` values are strings, so compare them as strings
  this_year <- as.character(lubridate::year(today))

  has_this_month <- any(
    idx$month == this_month & idx$year == this_year,
    na.rm = TRUE
  )

  # `&&` is the scalar operator intended for `if` conditions
  if (!has_this_month && lubridate::day(today) > 25) {
    refresh_index_cache()
  } else {
    idx
  }

}

#' Fetch one page of CDX metadata for a domain from a CC index
#'
#' Sends a GET request to the `{index}-index` endpoint asking for JSON output
#' of `*.{domain}` at the given page, fails on an HTTP error status, and
#' parses the response body with `jsonlite::stream_in()`.
#'
#' @md
#' @param domain domain name
#' @param index CC index file
#' @param page API page #
#' @return data frame (CDX)
get_data <- function(domain, index, page) {

  endpoint <- file.path(
    "http://index.commoncrawl.org",
    glue::glue("{index}-index")
  )

  params <- list(
    url = glue::glue("*.{domain}"),
    output = "json",
    page = page
  )

  res <- httr::GET(url = endpoint, query = params)
  httr::stop_for_status(res)

  # Hand the raw body to the streaming parser through an in-memory
  # connection; the connection is closed when the function exits
  body <- httr::content(res, as = "raw", encoding = "UTF-8")
  rcon <- rawConnection(body)
  on.exit(close(rcon), add = TRUE)

  jsonlite::stream_in(rcon, verbose = FALSE)

}

#' Grab all the URL data from the CC for a given index and domain
#'
#' Asks the index endpoint how many result pages exist for `*.{domain}`
#' (`showNumPages`), then fetches every page with `get_data()` and binds the
#' pages into a single data frame.
#'
#' @md
#' @param domain domain name
#' @param index CC index file; a single leading `/` is ignored
#' @return data frame (CDX), returned invisibly; it has no rows when the API
#'   reports zero pages
crawl_index <- function(domain, index) {

  # Cached index paths are hrefs and may start with "/"; strip it so the
  # endpoint URL built below does not contain a double slash
  index <- sub("^/", "", index)

  res <- httr::GET(
    url = file.path("http://index.commoncrawl.org", glue::glue("{index}-index")),
    query = list(
      url = glue::glue("*.{domain}"),
      output = "json",
      showNumPages = TRUE
    )
  )

  httr::stop_for_status(res)

  meta <- jsonlite::fromJSON(httr::content(res, as = "text", encoding = "UTF-8"))

  # Pages are requested starting at 0. seq_len() gives an empty sequence when
  # there are no pages, whereas 0:(pages - 1) would yield c(0, -1) and request
  # two pages that do not exist
  pages <- seq_len(meta$pages) - 1L

  out <- purrr::map_df(pages, get_data, domain = domain, index = index)

  # as_tibble() replaces the deprecated dplyr::tbl_df(); the result stays
  # invisible, as it was when the function ended in an assignment
  invisible(tibble::as_tibble(out))

}

#' List the available CC crawl indices
#'
#' Prints the `year`, `month` and `path` columns of the index list to stdout,
#' in the reverse of the order in which `fetch_indexes()` returns them.
#'
#' @md
#' @return indices (invisibly), in their original order
list_indexes <- function() {
  tmp <- data.frame(dplyr::select(fetch_indexes(), year, month, path))

  # rev(seq_len()) instead of nrow(tmp):1 so that an empty index list prints
  # no rows; with zero rows, 0:1 would select a spurious all-NA row
  reversed <- tmp[rev(seq_len(nrow(tmp))), , drop = FALSE]

  print(reversed, row.names = FALSE, quote = FALSE)
  invisible(tmp)
}

# Script entry point: make sure the cache directory exists, then either list
# the known indexes or crawl one index for the requested domain and write the
# records as JSON lines to --out (or stdout when --out is not given).
setup_cache()

if (opts$list) { # just list the indexes (will prime, cache and auto-update)
  list_indexes()
} else {
  idx <- fetch_indexes()

  # Index names are compared and used without a leading "/", so the value
  # that is validated is also the value that is crawled
  known <- sub("^/", "", idx$path)

  if (is.na(opts$index)) { # use latest index (first entry of the list)
    index <- known[1]
    if (is.na(index)) {
      stop("No indexes available", call. = FALSE)
    }
  } else { # use a specific index
    index <- sub("^/", "", opts$index)
    if (!(index %in% known)) {
      stop("Index does not exist", call. = FALSE)
    }
  }

  out <- crawl_index(opts$domain, index)
  where <- if (is.na(opts$out)) stdout() else file(path.expand(opts$out))
  jsonlite::stream_out(out, where, verbose = FALSE)
}

quit(save = "no", status = 0)