Skip to content
This repository has been archived by the owner on Dec 2, 2021. It is now read-only.

Commit

Permalink
Support incremental publication extracts
Browse files Browse the repository at this point in the history
Fixes #303
  • Loading branch information
mjgiarlo committed Nov 26, 2018
1 parent ee324b4 commit a7e9cd7
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 53 deletions.
19 changes: 11 additions & 8 deletions config/schedule.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@

# Config file for whenever. Learn more: http://github.com/javan/whenever

$LOAD_PATH.unshift 'lib'
require 'rialto/etl'

set :output, 'log/etl_cron.log'

job_type :exe, 'cd :path && :task'

every :tuesday, at: '01:05pm' do
exe 'exe/extract call StanfordOrganizations > data/organizations.json ' \
'&& exe/transform call StanfordOrganizations -i data/organizations.json > data/organizations.sparql ' \
'&& exe/load call Sparql -i data/organizations.sparql ' \
'&& exe/extract call StanfordResearchers > data/researchers.ndj ' \
'&& exe/transform call StanfordPeople -i data/researchers.ndj > data/researchers.sparql ' \
'&& exe/load call Sparql -i data/researchers.sparql ' \
'&& exe/transform call StanfordPeopleList -i data/researchers.ndj > data/researchers.csv ' \
'&& exe/grants load -s 3 -i data/researchers.csv ' \
'&& exe/publications load -d data/raw/pubs -o data/pubs'
'&& exe/transform call StanfordOrganizations -i data/organizations.json > data/organizations.sparql ' \
'&& exe/load call Sparql -i data/organizations.sparql ' \
'&& exe/extract call StanfordResearchers > data/researchers.ndj ' \
'&& exe/transform call StanfordPeople -i data/researchers.ndj > data/researchers.sparql ' \
'&& exe/load call Sparql -i data/researchers.sparql ' \
'&& exe/transform call StanfordPeopleList -i data/researchers.ndj > data/researchers.csv ' \
'&& exe/grants load -s 3 -i data/researchers.csv ' \
"&& exe/publications load -d data/raw/pubs -o data/pubs --since #{Settings.wos.load_timespan}"
end
10 changes: 10 additions & 0 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ cap:

wos:
api_key: 'evendumbervalue'
# From WoS API docs: https://developer.clarivate.com/swagger-ui/?url=/apis/wos/swagger
#
# Load time span (otherwise described as symbolic time span) defines a range
# of load dates. The load date is the date a record was added to the database.
# If load date is specified, the publishTimeSpan parameter must be omitted. If
# both publishTimeSpan and loadTimeSpan are omitted, the maximum publication
# date will be inferred from the editions data. Any of D/W/M/Y prefixed with a
# number where D-Day, M-Month, W-Week, Y-Year allowed. Acceptable value range
# for Day(0-6), Week(1-52), Month(1-12) and Year(0-10), ex: 5D,30W,10M,8Y
load_timespan: '8W'

sera:
clientid: placeholderid
Expand Down
5 changes: 5 additions & 0 deletions lib/rialto/etl/cli/extract.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ class Extract < Thor
desc: 'Institution name (for WebOfScience)',
aliases: '-i'

option :since,
required: false,
banner: 'SINCE',
desc: 'Load records since... (for WebOfScience)'

option :sunetid,
required: false,
banner: 'SUNETID',
Expand Down
13 changes: 12 additions & 1 deletion lib/rialto/etl/cli/publications.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ class Publications < Thor
banner: 'FORCE',
desc: 'Overwrite files that already exist',
aliases: '-f'
option :since,
required: false,
banner: 'SINCE',
desc: 'Load records since...'
option :skip_extract,
default: false,
type: :boolean,
Expand All @@ -48,14 +52,21 @@ def load

private

def passthrough_options
# @note Thor options hashes may be accessed "indifferently" w/r/t
# whether their keys are strings or symbols, but this does *not* work
# with the `Hash#slice` method
options.slice('since')
end

def cached_files(&_block)
Dir.glob("#{input_directory}/WOS*.json").each { |file_path| yield file_path }
end

# rubocop:disable Metrics/MethodLength
def extract(&block)
return cached_files(&block) if options[:skip_extract]
Rialto::Etl::Extractors::WebOfScience.new.each do |record_list|
Rialto::Etl::Extractors::WebOfScience.new(passthrough_options).each do |record_list|
JSON.parse(record_list).each do |record|
extract_file = File.join(input_directory, "#{record['UID']}.json")

Expand Down
6 changes: 5 additions & 1 deletion lib/rialto/etl/extractors/web_of_science.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class WebOfScience
# @option options [String] :client a preconfigured client. May be used for testing, all other
# options will be ignored.
# @option options [String] :institution ('Stanford University') The institution to search for
# @option options [String] :since (nil) How far back to retrieve records. If not provided, extract all records.
def initialize(**options)
@client = options.fetch(:client) { build_client(options) }
end
Expand All @@ -30,7 +31,10 @@ def each
private

def build_client(options)
ServiceClient::WebOfScienceClient.new(institution: options.fetch(:institution, DEFAULT_INSTITUTION))
ServiceClient::WebOfScienceClient.new(
institution: options.fetch(:institution, DEFAULT_INSTITUTION),
since: options[:since]
)
end
end
end
Expand Down
26 changes: 14 additions & 12 deletions lib/rialto/etl/service_client/web_of_science_client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ class WebOfScienceClient
DEFAULT_QUERY_ID = 0
NO_RECORDS_FOUND = 0

def initialize(institution:)
def initialize(institution:, since: nil)
@institution = institution
@since = since
end

attr_reader :institution
attr_reader :institution, :since

# Hit the API endpoint and iterate over resulting records
def each
Expand All @@ -46,12 +47,11 @@ def each
attr_accessor :records_found, :query_id, :publication_range

def publication_ranges
# Short-circuit publication ranges if `since` was supplied
return Array.wrap(since) if since
[
'1800-01-01+1989-12-31', '1990-01-01+1999-12-31', '2000-01-01+2009-12-31',
'2010-01-01+2010-12-31', '2011-01-01+2011-12-31', '2012-01-01+2012-12-31',
'2013-01-01+2013-12-31', '2014-01-01+2014-12-31', '2015-01-01+2015-12-31',
'2016-01-01+2016-12-31', '2017-01-01+2017-12-31', '2018-01-01+2018-12-31',
'2019-01-01+2019-12-31', '2020-01-01+2021-12-31'
'2010-01-01+2013-12-31', '2014-01-01+2017-12-31', '2018-01-01+2021-12-31'
]
end

Expand Down Expand Up @@ -80,12 +80,14 @@ def connect_with_retries(path:)
# @return [String] path for the user query
def user_query_path
usr_query = "OG=#{institution}"
params = USER_QUERY_PARAMS.merge(
firstRecord: 1,
count: 1,
usrQuery: usr_query,
publishTimeSpan: publication_range
)
params = USER_QUERY_PARAMS.merge(firstRecord: 1,
count: 1,
usrQuery: usr_query)
if since
params['loadTimeSpan'] = publication_range
else
params['publishTimeSpan'] = publication_range
end
build_uri(path: USER_QUERY_PATH, params: params)
end

Expand Down
11 changes: 11 additions & 0 deletions spec/cli/publications_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,17 @@
end
end

context 'with --since flag' do
let(:args) do
['--force', '--input-directory', 'spec/fixtures/wos', '--output-directory', dir, '--since', '2W']
end

it 'invokes the extractor with the `since` option supplied' do
loader.invoke_command(command)
expect(Rialto::Etl::Extractors::WebOfScience).to have_received(:new).once.with('since' => '2W')
end
end

context 'with --skip-load flag' do
let(:args) do
['--force', '--input-directory', 'spec/fixtures/wos', '--output-directory', dir, '--skip-load']
Expand Down
33 changes: 15 additions & 18 deletions spec/extractors/web_of_science_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@
end

context 'when not given a client' do
before do
stub_request(:get, 'https://api.clarivate.com/api/wos' \
'?count=1&databaseId=WOS&firstRecord=1&usrQuery=AU=%22,%22%20AND%20OG=Stanford%20University')
.to_return(status: 200, body: '{}', headers: {})
end

it 'builds a client' do
expect(extractor.client).to be_an_instance_of(Rialto::Etl::ServiceClient::WebOfScienceClient)
end
Expand All @@ -31,30 +25,33 @@
let(:institution) { 'Foo University' }
let(:options) { { institution: institution } }

before do
stub_request(:get, 'https://api.clarivate.com/api/wos' \
'?count=1&databaseId=WOS&firstRecord=1&usrQuery=AU=%22,%22%20AND%20OG=Foo%20University')
.to_return(status: 200, body: '{}', headers: {})
end

it 'passes the value to the client' do
expect(extractor.client.institution).to eq institution
end
end

context 'when not given an institution' do
before do
stub_request(:get, 'https://api.clarivate.com/api/wos' \
'?count=1&databaseId=WOS&firstRecord=1&usrQuery=AU=%22,%22%20AND%20OG=Stanford%20University')
.to_return(status: 200, body: '{}', headers: {})
end

it 'defaults to "Stanford University"' do
expect(extractor.client.institution).to eq described_class::DEFAULT_INSTITUTION
end
end
end

context 'when given a since value' do
let(:since) { '2M' }
let(:options) { { since: since } }

it 'passes the value to the client' do
expect(extractor.client.since).to eq since
end
end

context 'when not given an since' do
it 'passes nil to the client ' do
expect(extractor.client.since).to be nil
end
end

describe '#each' do
let(:client) do
[
Expand Down
51 changes: 38 additions & 13 deletions spec/service_client/web_of_science_client_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,45 @@
let(:publication_ranges) { ['1700-01-01+1700-12-31'] }
let(:records_found) { 1 }

# rubocop:disable RSpec/AnyInstance
before do
stub_request(:get, 'https://api.clarivate.com/api/wos?count=1&databaseId=WOS&firstRecord=1' \
'&usrQuery=OG=Stanford%20University&publishTimeSpan=1700-01-01%2B1700-12-31')
.to_return(status: 200, body: api_response, headers: {})

allow(client).to receive(:query_id).and_return(123)
allow(client).to receive(:records_found).and_return(records_found)
allow(client).to receive(:publication_ranges).and_return(publication_ranges)
allow_any_instance_of(Faraday::Request::Retry).to receive(:sleep)
describe '#initialize' do
context 'when since value is supplied' do
subject(:client) { described_class.new(institution: 'Stanford University', since: since_value) }

let(:since_value) { '4D' }

before do
client.send(:publication_range=, since_value)
end

it 'sets the since value' do
expect(client.since).to eq since_value
end

it 'short-circuits the publication ranges' do
expect(client.send(:publication_ranges)).to eq Array(since_value)
end

it 'uses the loadTimeSpan param instead of the publishTimeSpan param' do
expect(client.send(:user_query_path)).to eq '/api/wos?databaseId=WOS&firstRecord=1&count=1' \
'&usrQuery=OG%3DStanford+University&loadTimeSpan=4D'
end
end
end
# rubocop:enable RSpec/AnyInstance

describe '#each' do
# rubocop:disable RSpec/AnyInstance
before do
stub_request(:get, 'https://api.clarivate.com/api/wos?count=1&databaseId=WOS&firstRecord=1' \
'&usrQuery=OG=Stanford%20University&publishTimeSpan=1700-01-01%2B1700-12-31')
.to_return(status: 200, body: api_response, headers: {})

allow(client).to receive(:query_id).and_return(123)
allow(client).to receive(:records_found).and_return(records_found)
allow(client).to receive(:publication_ranges).and_return(publication_ranges)
allow_any_instance_of(Faraday::Request::Retry).to receive(:sleep)
end
# rubocop:enable RSpec/AnyInstance

context 'when connection raises an exception' do
let(:error_message) { 'Uh oh!' }
let(:path) { '/api/wos/query/123?firstRecord=1&count=100' }
Expand All @@ -31,7 +56,7 @@
end

it 'raises an exception' do
expect { client.each {} }.to raise_error(RuntimeError)
expect { client.each.to_a }.to raise_error(RuntimeError)
end
end

Expand All @@ -53,7 +78,7 @@
end

it 'retries and writes to stderr multiple times' do
expect { client.each {} }.to output(expected_output_regex).to_stderr.and raise_error(RuntimeError)
expect { client.each.to_a }.to output(expected_output_regex).to_stderr.and raise_error(RuntimeError)
end
end

Expand Down

0 comments on commit a7e9cd7

Please sign in to comment.