Merge pull request #16 from unt-libraries/add-transcriptions-data

Add transcriptions data to ResourceObject.
unt-libraries · Aug 31, 2018 · 2c6cd46 · 2c6cd46
2 parents 23f4037 + f8ce45f
commit 2c6cd46
Show file tree

Hide file tree

Showing 8 changed files with 160 additions and 15 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ dist/
 
 .tox/*
 __pycache__/*
+.pytest_cache/
diff --git a/.travis.yml b/.travis.yml
@@ -2,7 +2,6 @@ language: python
 sudo: false
 python: 2.7
 env:
-    - TOX_ENV=py26
     - TOX_ENV=py27
     - TOX_ENV=py34
     - TOX_ENV=flake8

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 Change Log
 ==========
 
+1.2.0
+-----
+
+* Added transcriptions data to ResourceObject instances.
+* Fixed flake8 failures dealing with bare excepts. Now those excepts catch all Exception instances.
+
+
 1.1.0
 -----
 

diff --git a/aubreylib/resource.py b/aubreylib/resource.py
@@ -73,7 +73,7 @@ def get_getCopy_data(getCopy_url, meta_id):
     # Try returning the getCopy data
     try:
         return json.loads(urllib2.urlopen(record_url).read())
-    except:
+    except Exception:
         # Otherwise, return an empty dictionary
         return {}
 
@@ -116,10 +116,22 @@ def get_dimensions_data(mets_file):
     dimensions_file = mets_file.replace('.mets.xml', '.json')
     try:
         return json.load(open_system_file(dimensions_file))
-    except:
+    except Exception:
         return None
 
 
+def get_transcriptions_data(meta_id, resource_type, transcriptions_server_url):
+    """Return the JSON transcriptions structure if it exists. Only for sounds and videos."""
+    if resource_type not in ['sound', 'video'] or not transcriptions_server_url:
+        return {}
+    transcriptions_url = '{}/{}/'.format(transcriptions_server_url.rstrip('/'), meta_id)
+    try:
+        return json.loads(urllib2.urlopen(transcriptions_url).read())
+    except Exception:
+        # Otherwise, return an empty dictionary
+        return {}
+
+
 class ResourceObject(object):
 
     def __init__(self, identifier, metadataLocations, staticFileLocations,
@@ -162,7 +174,7 @@ def __init__(self, identifier, metadataLocations, staticFileLocations,
         # Open the METS document
         try:
             mets_filehandle = open_system_file(self.mets_filename)
-        except:
+        except Exception:
             raise ResourceObjectException("Could not open the Mets " +
                                           "document: %s" % (self.meta_id))
         # Parse the mets document
@@ -174,6 +186,12 @@ def __init__(self, identifier, metadataLocations, staticFileLocations,
         # Get the descriptive metadata
         self.desc_MD = get_desc_metadata(self.metadata_file,
                                          self.metadata_type)
+        # Get transcriptions data
+        self.transcriptions = get_transcriptions_data(
+            meta_id=self.meta_id,
+            resource_type=self.desc_MD['resourceType'][0]['content'],
+            transcriptions_server_url=kwargs.get('transcriptions_server_url'),
+        )
         # Get the fileSets within the fileSec
         self.get_structMap(parsed_mets)
         # Get the embargo information, if it exists
@@ -410,15 +428,26 @@ def get_fileSets(self, manifest, fileSec, file_index):
         for fileSet in list(manifest):
             # Get the fileSet order number
             fileSet_num = int(fileSet.get("ORDER", '1'))
+            # Get the transcriptions data (if any) for this fileSet
+            fileSet_transcriptions = self.transcriptions.get(str(manifest_num), {}).get(
+                str(fileSet_num), [])
             # Get the file pointers and fileSet view type
             fileSet_data = self.get_file_pointers(fileSet, fileSec, file_index)
+            # Add the transcriptions (if any) to the file_ptrs list.
+            fileSet_data['file_ptrs'].extend(fileSet_transcriptions)
             # Create the fileSet data dictionary
             manifestation_dict[fileSet_num] = {
                 'file_ptrs': fileSet_data['file_ptrs'],
                 'order_label': fileSet.get("ORDERLABEL"),
                 'label': fileSet.get("LABEL"),
                 'fileSet_view_type': fileSet_data['fileSet_view_type'],
                 'zoom': fileSet_data['zoom'],
+                'has_vtt_captions': self.has_vtt_type(fileSet_transcriptions, 'captions'),
+                'has_vtt_subtitles': self.has_vtt_type(fileSet_transcriptions, 'subtitles'),
+                'has_vtt_descriptions': self.has_vtt_type(fileSet_transcriptions, 'descriptions'),
+                'has_vtt_chapters': self.has_vtt_type(fileSet_transcriptions, 'chapters'),
+                'has_vtt_thumbnails': self.has_vtt_type(fileSet_transcriptions, 'thumbnails'),
+                'has_vtt_metadata': self.has_vtt_type(fileSet_transcriptions, 'metadata'),
             }
             # If the manifestation doesn't have a view
             # type (return as a regular file)
@@ -448,6 +477,12 @@ def get_fileSets(self, manifest, fileSec, file_index):
         self.manifestation_labels[manifest_num] = manifest.get("LABEL", None)
         return manifestation_dict
 
+    def has_vtt_type(self, transcriptions_list, vtt_type):
+        for transcription_dict in transcriptions_list:
+            if transcription_dict.get('vtt_kind') == vtt_type:
+                return True
+        return False
+
     # Gets the file pointers from the given fileset
     # (searches for the fileset starting from the fileSec node or fileGrp node)
     # Slowest part of getting the resource object
@@ -557,7 +592,7 @@ def get_embargo(self):
                 try:
                     embargo_date = datetime.datetime.strptime(
                         date_string, "%Y-%m-%d")
-                except:
+                except Exception:
                     pass
                 else:
                     self.embargo_info['embargo_until_date'] = date_string
@@ -582,7 +617,7 @@ def get_embargo(self):
                             'REPOSITORY_ADMIN_DICT',
                             default_contact,
                         )
-                except:
+                except Exception:
                     self.embargo_info['repository_admin_contact'] =\
                         default_contact
                 # Attempt to get the author e-mails from the creator field

diff --git a/aubreylib/system.py b/aubreylib/system.py
@@ -68,7 +68,7 @@ def get_file_system(meta_id, file_path, location_tuple):
                     break
                 else:
                     system_path = None
-            except:
+            except Exception:
                 pass
     # returns the file name
     return system_path, file_location
@@ -112,7 +112,7 @@ def open_system_file(file_name):
         valid_url = create_valid_url(file_name)
         try:
             return urllib2.urlopen(valid_url)
-        except:
+        except Exception:
             return get_other_system(valid_url)
     # open it over the file system
     else:
@@ -165,7 +165,7 @@ def open_file_range(file_name, range_tuple):
         req = urllib2.Request(valid_url, None, headers)
         try:
             return urllib2.urlopen(req)
-        except:
+        except Exception:
             raise SystemMethodsException("Specified Range (%s,%s) not valid." % range_tuple)
     # open it over the file system
     else:
@@ -179,13 +179,13 @@ def get_other_system(failed_url):
     # Determine meta/static servers locations
     try:
         from django.conf import settings
-    except:
+    except Exception:
         from aubreylib import METADATA_LOCATIONS, STATIC_FILE_LOCATIONS
     else:
         try:
             METADATA_LOCATIONS = settings.METADATA_LOCATIONS
             STATIC_FILE_LOCATIONS = settings.STATIC_FILE_LOCATIONS
-        except:
+        except Exception:
             from aubreylib import METADATA_LOCATIONS, STATIC_FILE_LOCATIONS
     # Combine the metadata locations with static locations
     all_locations = METADATA_LOCATIONS + STATIC_FILE_LOCATIONS
@@ -197,6 +197,6 @@ def get_other_system(failed_url):
         new_url = failed_url.replace(host, replacement_host)
         try:
             return urllib2.urlopen(new_url, timeout=3)
-        except:
+        except Exception:
             pass
     raise SystemMethodsException("Can't locate file: %s" % (failed_url))
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name='aubreylib',
-    version='1.1.0',
+    version='1.2.0',
     description='A helper library for the Aubrey access system.',
     author='University of North Texas Libraries',
     author_email='[email protected]',

diff --git a/tests/test_resource.py b/tests/test_resource.py
@@ -2,7 +2,8 @@
 
 import os
 import pytest
-from mock import mock_open, patch
+import urllib2
+from mock import mock_open, patch, MagicMock
 
 from aubreylib import resource, USE
 
@@ -63,6 +64,61 @@ def test_get_dimensions_data_absent(self, mock_exists):
             assert returned_json is None
 
 
+class TestGetTranscriptionsData:
+
+    def test_get_transcriptions_data_wrong_resource_type(self):
+        result = resource.get_transcriptions_data('metadc123', 'text', 'http://example.com')
+        assert result == {}
+
+    @pytest.mark.parametrize('url', [
+        '',
+        None,
+    ])
+    def test_no_transcriptions_server_url(self, url):
+        result = resource.get_transcriptions_data('metadc123', 'text', url)
+        assert result == {}
+
+    @pytest.mark.parametrize('url', [
+        'http://example.com',
+        'http://example.com/',
+    ])
+    @patch('urllib2.urlopen')
+    def test_no_double_slash(self, mock_urlopen, url):
+        mock_urlopen.return_value = '{}'
+        resource.get_transcriptions_data('metadc123', 'video', url)
+        mock_urlopen.assert_called_once_with('http://example.com/metadc123/')
+
+    @patch('urllib2.urlopen')
+    def test_catches_urlopen_exceptions(self, mock_urlopen):
+        mock_urlopen.side_effect = [
+            urllib2.HTTPError,
+            ValueError,
+            TypeError,
+            AttributeError,
+        ]
+        for i in range(4):
+            result = resource.get_transcriptions_data('metadc123', 'video', 'bad_url')
+            assert result == {}
+
+    @patch('json.loads')
+    @patch('urllib2.urlopen')
+    def test_catches_loads_exceptions(self, mock_urlopen, mock_loads):
+        mock_loads.side_effect = [
+            ValueError,
+            TypeError,
+        ]
+        mock_urlopen.return_value = ''
+        for i in range(2):
+            result = resource.get_transcriptions_data('metadc123', 'video', 'bad_json')
+            assert result == {}
+
+    @patch('urllib2.urlopen')
+    def test_returns_expected_data(self, mock_urlopen):
+        mock_urlopen.return_value = MagicMock(read=lambda: '{"some": "data"}')
+        result = resource.get_transcriptions_data('metadc123', 'video', 'http://example.com')
+        assert result == {'some': 'data'}
+
+
 class TestResourceObject:
 
     @patch.object(resource.ResourceObject, 'get_fileSet_file')
@@ -93,3 +149,50 @@ def testResourceObjectDimensions(self, mocked_fileSet_file):
                               'USE': '4',
                               'flocat': 'file://web/pf_b-229.txt'}
         assert no_dimensions_data in ro.manifestation_dict[1][1]['file_ptrs']
+
+    @patch('aubreylib.resource.get_transcriptions_data')
+    @patch.object(resource.ResourceObject, 'get_fileSet_file')
+    def testResourceObjectTranscriptions(self, mocked_fileSet_file,
+                                         mocked_get_transcriptions_data):
+        """Verifies accurate transcriptions data is provided."""
+        mocked_fileSet_file.return_value = {'file_mimetype': '',
+                                            'file_name': '',
+                                            'files_system': ''}
+        expected_transcription_data = {
+            'MIMETYPE': 'text/vtt',
+            'SIZE': 3618,
+            'USE': 'vtt',
+            'flocat': 'http://example.com/over/there',
+            'language': 'eng',
+            'vtt_kind': 'captions',
+        }
+        mocked_get_transcriptions_data.return_value = {
+            '1': {
+                '1': [
+                    expected_transcription_data
+                ]
+            }
+        }
+
+        # Use the METs file from our test data to make resource object.
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        mets_path = '{0}/data/metapth12434.mets.xml'.format(current_directory)
+
+        ro = resource.ResourceObject(identifier=mets_path, metadataLocations=[],
+                                     staticFileLocations=[], mimetypeIconsPath='', use=USE,
+                                     transcriptions_server_url='http://example.com')
+
+        mocked_get_transcriptions_data.assert_called_once_with(
+            meta_id='metapth12434', resource_type='image_photo',
+            transcriptions_server_url='http://example.com')
+        assert expected_transcription_data in ro.manifestation_dict[1][1]['file_ptrs']
+
+        # Check all the 'has_vtt...' values.
+        # This record does have captions.
+        assert ro.manifestation_dict[1][1]['has_vtt_captions']
+        # No other types of transcriptions exist for this record.
+        assert not ro.manifestation_dict[1][1]['has_vtt_subtitles']
+        assert not ro.manifestation_dict[1][1]['has_vtt_descriptions']
+        assert not ro.manifestation_dict[1][1]['has_vtt_chapters']
+        assert not ro.manifestation_dict[1][1]['has_vtt_thumbnails']
+        assert not ro.manifestation_dict[1][1]['has_vtt_metadata']
diff --git a/tox.ini b/tox.ini
@@ -2,7 +2,7 @@
 max-line-length = 99
 
 [tox]
-envlist = py26,py27,flake8
+envlist = py27,flake8
 
 [testenv]
 usedevelop=True
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,3 +10,4 @@ dist/

		.tox/*
		__pycache__/*
		.pytest_cache/