Skip to content

Commit

Permalink
Merge pull request #16 from unt-libraries/add-transcriptions-data
Browse files Browse the repository at this point in the history
Add transcriptions data to ResourceObject.
  • Loading branch information
somexpert authored Aug 31, 2018
2 parents 23f4037 + f8ce45f commit 2c6cd46
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ dist/

.tox/*
__pycache__/*
.pytest_cache/
1 change: 0 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ language: python
sudo: false
python: 2.7
env:
- TOX_ENV=py26
- TOX_ENV=py27
- TOX_ENV=py34
- TOX_ENV=flake8
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Change Log
==========

1.2.0
-----

* Added transcriptions data to ResourceObject instances.
* Fixed flake8 failures dealing with bare excepts. Now those excepts catch all Exception instances.


1.1.0
-----

Expand Down
45 changes: 40 additions & 5 deletions aubreylib/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def get_getCopy_data(getCopy_url, meta_id):
# Try returning the getCopy data
try:
return json.loads(urllib2.urlopen(record_url).read())
except:
except Exception:
# Otherwise, return an empty dictionary
return {}

Expand Down Expand Up @@ -116,10 +116,22 @@ def get_dimensions_data(mets_file):
dimensions_file = mets_file.replace('.mets.xml', '.json')
try:
return json.load(open_system_file(dimensions_file))
except:
except Exception:
return None


def get_transcriptions_data(meta_id, resource_type, transcriptions_server_url):
    """Return the JSON transcriptions structure if it exists.

    Transcriptions are only looked up for sounds and videos.

    Args:
        meta_id: Identifier appended to the server URL to build the
            request URL.
        resource_type: Resource type string; anything other than
            'sound' or 'video' short-circuits to an empty dictionary.
        transcriptions_server_url: Base URL of the transcriptions
            server. A falsy value (None or '') disables the lookup.

    Returns:
        The decoded JSON response, or an empty dictionary when the
        lookup is skipped or fails for any reason.
    """
    if resource_type not in ('sound', 'video') or not transcriptions_server_url:
        return {}
    # Strip any trailing slash so the joined URL never contains '//'.
    transcriptions_url = '{}/{}/'.format(transcriptions_server_url.rstrip('/'), meta_id)
    try:
        response = urllib2.urlopen(transcriptions_url)
        try:
            return json.loads(response.read())
        finally:
            # Always release the connection, even if reading or
            # decoding the body fails.
            response.close()
    except Exception:
        # Transcriptions are optional: any failure (network, HTTP,
        # malformed JSON) falls back to an empty dictionary.
        return {}


class ResourceObject(object):

def __init__(self, identifier, metadataLocations, staticFileLocations,
Expand Down Expand Up @@ -162,7 +174,7 @@ def __init__(self, identifier, metadataLocations, staticFileLocations,
# Open the METS document
try:
mets_filehandle = open_system_file(self.mets_filename)
except:
except Exception:
raise ResourceObjectException("Could not open the Mets " +
"document: %s" % (self.meta_id))
# Parse the mets document
Expand All @@ -174,6 +186,12 @@ def __init__(self, identifier, metadataLocations, staticFileLocations,
# Get the descriptive metadata
self.desc_MD = get_desc_metadata(self.metadata_file,
self.metadata_type)
# Get transcriptions data
self.transcriptions = get_transcriptions_data(
meta_id=self.meta_id,
resource_type=self.desc_MD['resourceType'][0]['content'],
transcriptions_server_url=kwargs.get('transcriptions_server_url'),
)
# Get the fileSets within the fileSec
self.get_structMap(parsed_mets)
# Get the embargo information, if it exists
Expand Down Expand Up @@ -410,15 +428,26 @@ def get_fileSets(self, manifest, fileSec, file_index):
for fileSet in list(manifest):
# Get the fileSet order number
fileSet_num = int(fileSet.get("ORDER", '1'))
# Get the transcriptions data (if any) for this fileSet
fileSet_transcriptions = self.transcriptions.get(str(manifest_num), {}).get(
str(fileSet_num), [])
# Get the file pointers and fileSet view type
fileSet_data = self.get_file_pointers(fileSet, fileSec, file_index)
# Add the transcriptions (if any) to the file_ptrs list.
fileSet_data['file_ptrs'].extend(fileSet_transcriptions)
# Create the fileSet data dictionary
manifestation_dict[fileSet_num] = {
'file_ptrs': fileSet_data['file_ptrs'],
'order_label': fileSet.get("ORDERLABEL"),
'label': fileSet.get("LABEL"),
'fileSet_view_type': fileSet_data['fileSet_view_type'],
'zoom': fileSet_data['zoom'],
'has_vtt_captions': self.has_vtt_type(fileSet_transcriptions, 'captions'),
'has_vtt_subtitles': self.has_vtt_type(fileSet_transcriptions, 'subtitles'),
'has_vtt_descriptions': self.has_vtt_type(fileSet_transcriptions, 'descriptions'),
'has_vtt_chapters': self.has_vtt_type(fileSet_transcriptions, 'chapters'),
'has_vtt_thumbnails': self.has_vtt_type(fileSet_transcriptions, 'thumbnails'),
'has_vtt_metadata': self.has_vtt_type(fileSet_transcriptions, 'metadata'),
}
# If the manifestation doesn't have a view
# type (return as a regular file)
Expand Down Expand Up @@ -448,6 +477,12 @@ def get_fileSets(self, manifest, fileSec, file_index):
self.manifestation_labels[manifest_num] = manifest.get("LABEL", None)
return manifestation_dict

def has_vtt_type(self, transcriptions_list, vtt_type):
    """Tell whether any transcription has the given 'vtt_kind'.

    Returns True if at least one dictionary in transcriptions_list has
    a 'vtt_kind' value equal to vtt_type, otherwise False.
    """
    return any(
        entry.get('vtt_kind') == vtt_type
        for entry in transcriptions_list
    )

# Gets the file pointers from the given fileset
# (searches for the fileset starting from the fileSec node or fileGrp node)
# Slowest part of getting the resource object
Expand Down Expand Up @@ -557,7 +592,7 @@ def get_embargo(self):
try:
embargo_date = datetime.datetime.strptime(
date_string, "%Y-%m-%d")
except:
except Exception:
pass
else:
self.embargo_info['embargo_until_date'] = date_string
Expand All @@ -582,7 +617,7 @@ def get_embargo(self):
'REPOSITORY_ADMIN_DICT',
default_contact,
)
except:
except Exception:
self.embargo_info['repository_admin_contact'] =\
default_contact
# Attempt to get the author e-mails from the creator field
Expand Down
12 changes: 6 additions & 6 deletions aubreylib/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_file_system(meta_id, file_path, location_tuple):
break
else:
system_path = None
except:
except Exception:
pass
# returns the file name
return system_path, file_location
Expand Down Expand Up @@ -112,7 +112,7 @@ def open_system_file(file_name):
valid_url = create_valid_url(file_name)
try:
return urllib2.urlopen(valid_url)
except:
except Exception:
return get_other_system(valid_url)
# open it over the file system
else:
Expand Down Expand Up @@ -165,7 +165,7 @@ def open_file_range(file_name, range_tuple):
req = urllib2.Request(valid_url, None, headers)
try:
return urllib2.urlopen(req)
except:
except Exception:
raise SystemMethodsException("Specified Range (%s,%s) not valid." % range_tuple)
# open it over the file system
else:
Expand All @@ -179,13 +179,13 @@ def get_other_system(failed_url):
# Determine meta/static servers locations
try:
from django.conf import settings
except:
except Exception:
from aubreylib import METADATA_LOCATIONS, STATIC_FILE_LOCATIONS
else:
try:
METADATA_LOCATIONS = settings.METADATA_LOCATIONS
STATIC_FILE_LOCATIONS = settings.STATIC_FILE_LOCATIONS
except:
except Exception:
from aubreylib import METADATA_LOCATIONS, STATIC_FILE_LOCATIONS
# Combine the metadata locations with static locations
all_locations = METADATA_LOCATIONS + STATIC_FILE_LOCATIONS
Expand All @@ -197,6 +197,6 @@ def get_other_system(failed_url):
new_url = failed_url.replace(host, replacement_host)
try:
return urllib2.urlopen(new_url, timeout=3)
except:
except Exception:
pass
raise SystemMethodsException("Can't locate file: %s" % (failed_url))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name='aubreylib',
version='1.1.0',
version='1.2.0',
description='A helper library for the Aubrey access system.',
author='University of North Texas Libraries',
author_email='[email protected]',
Expand Down
105 changes: 104 additions & 1 deletion tests/test_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import os
import pytest
from mock import mock_open, patch
import urllib2
from mock import mock_open, patch, MagicMock

from aubreylib import resource, USE

Expand Down Expand Up @@ -63,6 +64,61 @@ def test_get_dimensions_data_absent(self, mock_exists):
assert returned_json is None


class TestGetTranscriptionsData:
    """Tests for resource.get_transcriptions_data."""

    def test_get_transcriptions_data_wrong_resource_type(self):
        """Resource types other than sound/video give an empty dict."""
        result = resource.get_transcriptions_data('metadc123', 'text', 'http://example.com')
        assert result == {}

    @pytest.mark.parametrize('url', [
        '',
        None,
    ])
    @patch('urllib2.urlopen')
    def test_no_transcriptions_server_url(self, mock_urlopen, url):
        """A missing server URL gives an empty dict without any request."""
        # Use a resource type that supports transcriptions so the empty
        # result can only come from the missing URL, not the type check.
        result = resource.get_transcriptions_data('metadc123', 'video', url)
        assert result == {}
        assert not mock_urlopen.called

    @pytest.mark.parametrize('url', [
        'http://example.com',
        'http://example.com/',
    ])
    @patch('urllib2.urlopen')
    def test_no_double_slash(self, mock_urlopen, url):
        """A trailing slash on the server URL is not doubled."""
        mock_urlopen.return_value = MagicMock(read=lambda: '{}')
        resource.get_transcriptions_data('metadc123', 'video', url)
        mock_urlopen.assert_called_once_with('http://example.com/metadc123/')

    @patch('urllib2.urlopen')
    def test_catches_urlopen_exceptions(self, mock_urlopen):
        """Any exception raised by urlopen results in an empty dict."""
        errors = [
            # HTTPError cannot be instantiated without arguments, so
            # supply an instance to really raise an HTTPError.
            urllib2.HTTPError('http://bad_url/metadc123/', 404, 'Not Found', {}, None),
            ValueError,
            TypeError,
            AttributeError,
        ]
        mock_urlopen.side_effect = errors
        for _ in errors:
            result = resource.get_transcriptions_data('metadc123', 'video', 'bad_url')
            assert result == {}
        assert mock_urlopen.call_count == len(errors)

    @patch('json.loads')
    @patch('urllib2.urlopen')
    def test_catches_loads_exceptions(self, mock_urlopen, mock_loads):
        """Any exception raised by json.loads results in an empty dict."""
        errors = [
            ValueError,
            TypeError,
        ]
        mock_loads.side_effect = errors
        # The response must be readable, otherwise the call fails on
        # .read() and json.loads is never reached.
        mock_urlopen.return_value = MagicMock(read=lambda: '')
        for _ in errors:
            result = resource.get_transcriptions_data('metadc123', 'video', 'bad_json')
            assert result == {}
        assert mock_loads.call_count == len(errors)

    @patch('urllib2.urlopen')
    def test_returns_expected_data(self, mock_urlopen):
        """A valid JSON response is decoded and returned."""
        mock_urlopen.return_value = MagicMock(read=lambda: '{"some": "data"}')
        result = resource.get_transcriptions_data('metadc123', 'video', 'http://example.com')
        assert result == {'some': 'data'}


class TestResourceObject:

@patch.object(resource.ResourceObject, 'get_fileSet_file')
Expand Down Expand Up @@ -93,3 +149,50 @@ def testResourceObjectDimensions(self, mocked_fileSet_file):
'USE': '4',
'flocat': 'file://web/pf_b-229.txt'}
assert no_dimensions_data in ro.manifestation_dict[1][1]['file_ptrs']

@patch('aubreylib.resource.get_transcriptions_data')
@patch.object(resource.ResourceObject, 'get_fileSet_file')
def testResourceObjectTranscriptions(self, mocked_fileSet_file,
                                     mocked_get_transcriptions_data):
    """Check transcriptions land in the fileSet data with correct flags."""
    mocked_fileSet_file.return_value = {
        'file_mimetype': '',
        'file_name': '',
        'files_system': '',
    }
    expected_transcription_data = {
        'MIMETYPE': 'text/vtt',
        'SIZE': 3618,
        'USE': 'vtt',
        'flocat': 'http://example.com/over/there',
        'language': 'eng',
        'vtt_kind': 'captions',
    }
    # Transcriptions are keyed by manifestation number, then fileSet number.
    mocked_get_transcriptions_data.return_value = {
        '1': {'1': [expected_transcription_data]},
    }

    # Build the resource object from the METS file in the test data.
    test_dir = os.path.dirname(os.path.abspath(__file__))
    mets_path = '{0}/data/metapth12434.mets.xml'.format(test_dir)
    ro = resource.ResourceObject(
        identifier=mets_path,
        metadataLocations=[],
        staticFileLocations=[],
        mimetypeIconsPath='',
        use=USE,
        transcriptions_server_url='http://example.com',
    )

    mocked_get_transcriptions_data.assert_called_once_with(
        meta_id='metapth12434',
        resource_type='image_photo',
        transcriptions_server_url='http://example.com',
    )

    first_fileSet = ro.manifestation_dict[1][1]
    assert expected_transcription_data in first_fileSet['file_ptrs']

    # This record has captions and no other kind of transcription.
    assert first_fileSet['has_vtt_captions']
    assert not first_fileSet['has_vtt_subtitles']
    assert not first_fileSet['has_vtt_descriptions']
    assert not first_fileSet['has_vtt_chapters']
    assert not first_fileSet['has_vtt_thumbnails']
    assert not first_fileSet['has_vtt_metadata']
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
max-line-length = 99

[tox]
envlist = py26,py27,flake8
envlist = py27,flake8

[testenv]
usedevelop=True
Expand Down

0 comments on commit 2c6cd46

Please sign in to comment.