Skip to content
This repository has been archived by the owner on Dec 2, 2021. It is now read-only.

Commit

Permalink
Merge pull request #138 from sul-dlss-labs/t101-WOS
Browse files Browse the repository at this point in the history
refs #101. Aligns WOS transformer with new transformer pattern. refs …
  • Loading branch information
mjgiarlo authored Sep 24, 2018
2 parents ac01385 + a9532d5 commit 81f9c11
Show file tree
Hide file tree
Showing 8 changed files with 1,262 additions and 173 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
require 'digest'
require 'traject_plus'
require 'rialto/etl/readers/ndjson_reader'
require 'rialto/etl/writers/sparql_statement_writer'
require 'active_support/core_ext/array/wrap'
require 'rialto/etl/namespaces'

extend TrajectPlus::Macros
extend TrajectPlus::Macros::JSON
extend Rialto::Etl::NamedGraphs
extend Rialto::Etl::Vocabs

# Do a lookup using the entity resolution service.
# @param [String] type the type of entity
Expand Down Expand Up @@ -55,59 +59,96 @@ def lookup_address(addresses, addr_id)
end

# Reader/writer configuration for the WOS transformer.
# Only one writer is declared here: Traject's `provide` keeps the first value
# stored for a key, so a leftover `provide 'writer_class_name', ...` line ahead
# of this one would silently win over the SPARQL statement writer.
settings do
  provide 'writer_class_name', 'Rialto::Etl::Writers::SparqlStatementWriter'
  provide 'reader_class_name', 'Rialto::Etl::Readers::NDJsonReader'
  # provide 'processing_thread_pool', 0 # Turns off multithreading, for debugging
end

# The named graph to place these triples into.
to_field '@graph', literal(WOS_GRAPH.to_s), single: true

# Subject URI of the publication: the RIALTO publications base followed by
# the MD5 hex digest of the record's UID.
to_field '@id', lambda { |json, accumulator|
  uid_digest = Digest::MD5.hexdigest(JsonPath.on(json, '$.UID').first)
  accumulator << "http://sul.stanford.edu/rialto/publications/#{uid_digest}"
}, single: true

# Publication type(s): the record's normalized doctypes are run through the
# 'wos_document_types_to_rialto' translation map, then each resulting value is
# wrapped in an RDF::URI in place.
to_field '@type',
         extract_json('$.static_data.fullrecord_metadata.normalized_doctypes.doctype',
                      translation_map: 'wos_document_types_to_rialto') do |_, accumulator|
  accumulator.map! { |type| RDF::URI.new(type) }
end

# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{BIBO['abstract']}", literal(true), single: true

# Abstract text: all abstract paragraphs flattened and joined with a single
# space. Nothing is emitted when the record has no abstract paragraphs.
to_field BIBO['abstract'].to_s, lambda { |json, accumulator|
  paragraphs = JsonPath.on(json, '$.static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p')
  next if paragraphs.empty?

  accumulator << paragraphs.flatten.join(' ')
}, single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{BIBO['doi']}", literal(true), single: true

# DOI: the first identifier of type "doi", falling back to the first of type
# "xref_doi". Nothing is emitted when neither is present.
to_field BIBO['doi'].to_s, lambda { |json, accumulator|
  doi = JsonPath.on(json, '$.dynamic_data.cluster_related.identifiers.identifier[?(@.type=="doi")].value').first ||
        JsonPath.on(json, '$.dynamic_data.cluster_related.identifiers.identifier[?(@.type=="xref_doi")].value').first
  accumulator << doi if doi
}, single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{VIVO['relatedBy']}", literal(true), single: true

# Authorships: one VIVO Authorship node per contributor listed in the record.
# Each contributor is looked up in the entity resolution service; when no
# match comes back, a new person node is used instead, identified by the MD5
# digest of the downcased "first last" name.
to_field VIVO['relatedBy'].to_s, lambda { |json, accumulator|
  addresses = fetch_addresses(json)
  # Lookup all the contributors in the entity resolution service to find their URIs.
  contributors = Array.wrap(JsonPath.on(json, '$.static_data.summary.names.name').first)
  authorships = contributors.map do |c|
    address = lookup_address(addresses, c['addr_no'])
    person_params = c.slice('orcid_id', 'first_name', 'last_name', 'full_name')
    person_params.merge!(address) if address
    resolved_person = resolve_entity('person', person_params)
    new_person = {
      '@id' => RIALTO_PEOPLE[Digest::MD5.hexdigest("#{c['first_name']} #{c['last_name']}".downcase)],
      '@type' => [FOAF['Agent'], FOAF['Person']]
      # TODO: labels and name vcard
    }
    # The authorship id is "<record UID>_<person id>", where the person id is
    # the person URI with the RIALTO people prefix stripped.
    person_uri = resolved_person || new_person['@id']
    person_key = person_uri.to_s.delete_prefix(RIALTO_PEOPLE.to_s)
    {
      '@id' => RIALTO_CONTEXT_RELATIONSHIPS["#{json['UID']}_#{person_key}"],
      '@type' => VIVO['Authorship'],
      # Interpolated like the other "!" fields; a key missing the `#` would be
      # the literal text rather than the relates predicate.
      "!#{VIVO['relates']}" => true,
      VIVO['relates'].to_s => resolved_person || new_person
    }
  end
  accumulator << authorships
}, single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{DCTERMS['subject']}", literal(true), single: true

# Subjects: each "extended" subject heading is looked up as a topic in the
# entity resolution service; when no match comes back, a new SKOS Concept
# node is used, identified by the MD5 digest of the downcased heading.
to_field DCTERMS['subject'].to_s, lambda { |json, accumulator|
  subjects = JsonPath.on(json, '$.static_data.fullrecord_metadata.category_info.subjects.' \
                               "subject[?(@.ascatype=='extended')].content")
  topics = subjects.map do |subject|
    resolve_entity('topic', name: subject) || { '@id' => RIALTO_CONCEPTS[Digest::MD5.hexdigest(subject.downcase)],
                                                '@type' => SKOS['Concept'], DCTERMS['subject'].to_s => subject }
  end
  accumulator << topics
}, single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{BIBO['identifier']}", literal(true), single: true

# Identifiers: the value of every identifier listed for the record.
to_field BIBO['identifier'].to_s,
         extract_json('$.dynamic_data.cluster_related.identifiers.identifier[*].value')
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{DCTERMS['isPartOf']}", literal(true), single: true

# Containing source: the content of the record's title of type 'source'.
to_field DCTERMS['isPartOf'].to_s,
         extract_json("$.static_data.summary.titles.title[?(@.type=='source')].content"),
         single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{VIVO['publisher']}", literal(true), single: true

# Publisher: the display name of the record's publisher.
to_field VIVO['publisher'].to_s,
         extract_json('$.static_data.summary.publishers.publisher.names.name.display_name'),
         single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{DCTERMS['title']}", literal(true), single: true

# Title: the content of the record's title of type 'item'.
to_field DCTERMS['title'].to_s,
         extract_json("$.static_data.summary.titles.title[?(@.type=='item')].content"),
         single: true
# NOTE(review): the "!"-prefixed field is presumably a marker consumed by the
# SPARQL statement writer for this predicate -- confirm against the writer.
to_field "!#{DCTERMS['created']}", literal(true), single: true

# Creation date: the record's publication sort date.
to_field DCTERMS['created'].to_s,
         extract_json('$.static_data.summary.pub_info.sortdate'),
         single: true
4 changes: 4 additions & 0 deletions lib/rialto/etl/namespaces.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ module Vocabs
rialto_base = 'http://sul.stanford.edu/rialto/'
RIALTO_ORGANIZATIONS = RDF::Vocabulary.new(rialto_base + 'agents/orgs/')
RIALTO_PEOPLE = RDF::Vocabulary.new(rialto_base + 'agents/people/')
RIALTO_PUBLICATIONS = RDF::Vocabulary.new(rialto_base + 'publications/')
RIALTO_CONCEPTS = RDF::Vocabulary.new(rialto_base + 'concepts/')
RIALTO_CONTEXT_NAMES = RDF::Vocabulary.new(rialto_base + 'context/names/')
RIALTO_CONTEXT_ADDRESSES = RDF::Vocabulary.new(rialto_base + 'context/addresses/')
RIALTO_CONTEXT_RELATIONSHIPS = RDF::Vocabulary.new(rialto_base + 'context/relationships/')
Expand All @@ -22,12 +24,14 @@ module Vocabs
DCTERMS = RDF::Vocabulary.new('http://purl.org/dc/terms/')
OBO = RDF::Vocabulary.new('http://purl.obolibrary.org/obo/')
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')
BIBO = RDF::Vocabulary.new('http://purl.org/ontology/bibo/')
end
# Holds graph names: one RDF::URI constant per named graph, all sharing the
# RIALTO graphs base URI.
module NamedGraphs
  graphs_root = 'http://sul.stanford.edu/rialto/graphs/'
  STANFORD_PEOPLE_GRAPH = RDF::URI.new("#{graphs_root}stanford_people")
  STANFORD_ORGANIZATIONS_GRAPH = RDF::URI.new("#{graphs_root}stanford_organizations")
  WOS_GRAPH = RDF::URI.new("#{graphs_root}wos")
end
end
end
9 changes: 8 additions & 1 deletion lib/rialto/etl/service_client/entity_resolver.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,14 @@ def initialize

# Look up an entity in the entity resolution service.
# The response status alone decides the outcome; there is no early return on
# success, so a 2xx response always yields an RDF::URI rather than the raw body.
# @param [String] type the type of entity, used as the request path
# @param [Hash] params the query parameters describing the entity
# @return [RDF::URI, nil] a URI built from the response body on a 2xx status,
#   or nil when the service answers 404
# @raise [RuntimeError] when the service answers with any other status
def resolve(type, params)
  resp = conn.get(type, params)
  case resp.status
  when 200..299
    RDF::URI.new(resp.body)
  when 404
    nil
  else
    raise "Entity resolver returned #{resp.status} for #{type} type and #{params} params."
  end
end

def connection
Expand Down
Loading

0 comments on commit 81f9c11

Please sign in to comment.