Merge pull request #138 from sul-dlss-labs/t101-WOS

refs #101. Aligns WOS transformer with new transformer pattern. refs …
sul-dlss-deprecated · Sep 24, 2018 · 81f9c11 · 81f9c11
2 parents ac01385 + a9532d5
commit 81f9c11
Show file tree

Hide file tree

Showing 8 changed files with 1,262 additions and 173 deletions.
diff --git a/lib/rialto/etl/configs/wos.rb b/lib/rialto/etl/configs/wos_to_sparql_statements.rb
@@ -3,10 +3,14 @@
 require 'digest'
 require 'traject_plus'
 require 'rialto/etl/readers/ndjson_reader'
+require 'rialto/etl/writers/sparql_statement_writer'
 require 'active_support/core_ext/array/wrap'
+require 'rialto/etl/namespaces'
 
 extend TrajectPlus::Macros
 extend TrajectPlus::Macros::JSON
+extend Rialto::Etl::NamedGraphs
+extend Rialto::Etl::Vocabs
 
 # Do a lookup using the entity resolution service.
 # @param [String] type the type of entity
@@ -55,59 +59,96 @@ def lookup_address(addresses, addr_id)
 end
 
 settings do
-  provide 'writer_class_name', 'Traject::JsonWriter'
+  provide 'writer_class_name', 'Rialto::Etl::Writers::SparqlStatementWriter'
   provide 'reader_class_name', 'Rialto::Etl::Readers::NDJsonReader'
   # provide 'processing_thread_pool', 0 # Turns off multithreading, for debugging
 end
 
+# The named graph to place these triples into.
+to_field '@graph', literal(WOS_GRAPH.to_s), single: true
+
 to_field '@id', lambda { |json, accumulator|
   source_id = JsonPath.on(json, '$.UID').first
   subject_uri = "http://sul.stanford.edu/rialto/publications/#{Digest::MD5.hexdigest(source_id)}"
   accumulator << subject_uri
 }, single: true
+
 to_field '@type',
          extract_json('$.static_data.fullrecord_metadata.normalized_doctypes.doctype',
-                      translation_map: 'wos_document_types_to_rialto'),
-         single: true
-to_field 'http://purl.org/ontology/bibo/abstract', lambda { |json, accumulator|
+                      translation_map: 'wos_document_types_to_rialto') do |_, accumulator|
+  accumulator.map! { |type| RDF::URI.new(type) }
+end
+
+to_field "!#{BIBO['abstract']}", literal(true), single: true
+to_field BIBO['abstract'].to_s, lambda { |json, accumulator|
   abstracts = JsonPath.on(json, '$.static_data.fullrecord_metadata.abstracts.abstract.abstract_text.p')
   accumulator << abstracts.flatten.join(' ') unless abstracts.empty?
 }, single: true
-to_field 'http://purl.org/ontology/bibo/doi', lambda { |json, accumulator|
+
+to_field "!#{BIBO['doi']}", literal(true), single: true
+to_field BIBO['doi'].to_s, lambda { |json, accumulator|
   doi = JsonPath.on(json, '$.dynamic_data.cluster_related.identifiers.identifier[?(@.type=="doi")].value').first ||
         JsonPath.on(json, '$.dynamic_data.cluster_related.identifiers.identifier[?(@.type=="xref_doi")].value').first
   accumulator << doi if doi
 }, single: true
-to_field 'http://vivoweb.org/ontology/core#relatedBy', lambda { |json, accumulator|
+
+to_field "!#{VIVO['relatedBy']}", literal(true), single: true
+to_field VIVO['relatedBy'].to_s, lambda { |json, accumulator|
   addresses = fetch_addresses(json)
   # Lookup all the contributors in the entity resolution service to find their URIs.
   contributors = Array.wrap(JsonPath.on(json, '$.static_data.summary.names.name').first)
-  people_uris = contributors.map do |c|
+  authorships = contributors.map do |c|
     address = lookup_address(addresses, c['addr_no'])
     person_params = c.slice('orcid_id', 'first_name', 'last_name', 'full_name')
     person_params.merge!(address) if address
-    { '@id' => resolve_entity('person', person_params) }
+    resolved_person = resolve_entity('person', person_params)
+    new_person = {
+      '@id' => RIALTO_PEOPLE[Digest::MD5.hexdigest("#{c['first_name']} #{c['last_name']}".downcase)],
+      '@type' => [FOAF['Agent'], FOAF['Person']]
+      # TODO: labels and name vcard
+    }
+    {
+      '@id' => RIALTO_CONTEXT_RELATIONSHIPS["#{json['UID']}_#{(resolved_person ||
+          new_person['@id']).to_s.delete_prefix(RIALTO_PEOPLE.to_s)}"],
+      '@type' => VIVO['Authorship'],
+      "!#{VIVO['relates']}" => true,
+      VIVO['relates'].to_s => resolved_person || new_person
+
+    }
   end
-  accumulator << { '@type' => 'http://vivoweb.org/ontology/core#Authorship',
-                   'http://vivoweb.org/ontology/core#relates' => people_uris }
+  accumulator << authorships
 }, single: true
-to_field 'http://purl.org/dc/terms/subject', lambda { |json, accumulator|
-  subjects = JsonPath.on(json, "$.static_data.fullrecord_metadata.category_info.subjects.subject[?(@.ascatype=='extended')].content")
+
+to_field "!#{DCTERMS['subject']}", literal(true), single: true
+to_field DCTERMS['subject'].to_s, lambda { |json, accumulator|
+  subjects = JsonPath.on(json, '$.static_data.fullrecord_metadata.category_info.subjects.' \
+     "subject[?(@.ascatype=='extended')].content")
   accumulator << subjects.map do |subject|
-    resolve_entity('topic', name: subject)
+    resolve_entity('topic', name: subject) || { '@id' => RIALTO_CONCEPTS[Digest::MD5.hexdigest(subject.downcase)],
+                                                '@type' => SKOS['Concept'], DCTERMS['subject'].to_s => subject }
   end
 }, single: true
-to_field 'http://purl.org/ontology/bibo/identifier',
+
+to_field "!#{BIBO['identifier']}", literal(true), single: true
+to_field BIBO['identifier'].to_s,
          extract_json('$.dynamic_data.cluster_related.identifiers.identifier[*].value')
-to_field 'http://purl.org/dc/terms/hasPart',
+
+to_field "!#{DCTERMS['isPartOf']}", literal(true), single: true
+to_field DCTERMS['isPartOf'].to_s,
          extract_json("$.static_data.summary.titles.title[?(@.type=='source')].content"),
          single: true
-to_field 'http://vivoweb.org/ontology/core#publisher',
+
+to_field "!#{VIVO['publisher']}", literal(true), single: true
+to_field VIVO['publisher'].to_s,
          extract_json('$.static_data.summary.publishers.publisher.names.name.display_name'),
          single: true
-to_field 'http://purl.org/dc/terms/title',
+
+to_field "!#{DCTERMS['title']}", literal(true), single: true
+to_field DCTERMS['title'].to_s,
          extract_json("$.static_data.summary.titles.title[?(@.type=='item')].content"),
          single: true
-to_field 'http://purl.org/dc/terms/created',
+
+to_field "!#{DCTERMS['created']}", literal(true), single: true
+to_field DCTERMS['created'].to_s,
          extract_json('$.static_data.summary.pub_info.sortdate'),
          single: true
diff --git a/lib/rialto/etl/namespaces.rb b/lib/rialto/etl/namespaces.rb
@@ -9,6 +9,8 @@ module Vocabs
       rialto_base = 'http://sul.stanford.edu/rialto/'
       RIALTO_ORGANIZATIONS = RDF::Vocabulary.new(rialto_base + 'agents/orgs/')
       RIALTO_PEOPLE = RDF::Vocabulary.new(rialto_base + 'agents/people/')
+      RIALTO_PUBLICATIONS = RDF::Vocabulary.new(rialto_base + 'publications/')
+      RIALTO_CONCEPTS = RDF::Vocabulary.new(rialto_base + 'concepts/')
       RIALTO_CONTEXT_NAMES = RDF::Vocabulary.new(rialto_base + 'context/names/')
       RIALTO_CONTEXT_ADDRESSES = RDF::Vocabulary.new(rialto_base + 'context/addresses/')
       RIALTO_CONTEXT_RELATIONSHIPS = RDF::Vocabulary.new(rialto_base + 'context/relationships/')
@@ -22,12 +24,14 @@ module Vocabs
       DCTERMS = RDF::Vocabulary.new('http://purl.org/dc/terms/')
       OBO = RDF::Vocabulary.new('http://purl.obolibrary.org/obo/')
       RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')
+      BIBO = RDF::Vocabulary.new('http://purl.org/ontology/bibo/')
     end
     # Holds graph names
     module NamedGraphs
       rialto_base = 'http://sul.stanford.edu/rialto/graphs/'
       STANFORD_PEOPLE_GRAPH = RDF::URI.new(rialto_base + 'stanford_people')
       STANFORD_ORGANIZATIONS_GRAPH = RDF::URI.new(rialto_base + 'stanford_organizations')
+      WOS_GRAPH = RDF::URI.new(rialto_base + 'wos')
     end
   end
 end
diff --git a/lib/rialto/etl/service_client/entity_resolver.rb b/lib/rialto/etl/service_client/entity_resolver.rb
@@ -22,7 +22,14 @@ def initialize
 
         def resolve(type, params)
           resp = conn.get(type, params)
-          return resp.body if resp.success?
+          case resp.status
+          when 200..299
+            RDF::URI.new(resp.body)
+          when 404
+            nil
+          else
+            raise "Entity resolver returned #{resp.status} for #{type} type and #{params} params."
+          end
         end
 
         def connection