Skip to content

Commit

Permalink
Ensure support for UTF-8 chars in filenames; update tests and test data to test support.
Browse files Browse the repository at this point in the history
  • Loading branch information
RayPlante committed Jul 7, 2024
1 parent ece7fef commit ca554c9
Show file tree
Hide file tree
Showing 12 changed files with 108 additions and 30 deletions.
4 changes: 2 additions & 2 deletions python/nistoar/pdr/preserv/bagger/midas3.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from ....nerdm import utils as nerdutils
from ... import def_merge_etcdir, utils, ARK_NAAN, PDR_PUBLIC_SERVER
from .. import (SIPDirectoryError, SIPDirectoryNotFound, AIPValidationError,
ConfigurationException, StateException, PODError,
ConfigurationException, StateException, PODError, NERDError,
PreservationStateError)
from .... import pdr
from .prepupd import UpdatePrepService
Expand Down Expand Up @@ -324,7 +324,7 @@ def _filepaths_in_pod(self):

pod = self._pod_rec()

return [self._distsvcurl.sub('', urllib.unquote(d['downloadURL']))
return [self._distsvcurl.sub('', urllib.unquote(str(d['downloadURL'])))
for d in pod.get('distribution',[]) if 'downloadURL' in d]


Expand Down
8 changes: 4 additions & 4 deletions python/nistoar/pdr/preserv/bagit/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def _download_url(self, ediid, destpath):
if ediid.startswith(arkpfx):
# our convention is to omit the "ark:/88434/" prefix
ediid = ediid[len(arkpfx):]
return self._distbase + ediid + '/' + urlencode(path)
return self._distbase + ediid + '/' + urlencode(str(path))

def assign_id(self, id, keep_conv=False):
"""
Expand Down Expand Up @@ -2495,7 +2495,7 @@ def _create_def_datafile_md(self, destpath):
out = OrderedDict([
("_schema", NERD_DEF + "Component"),
("@context", NERDM_CONTEXT),
("@id", "cmps/" + urlencode(destpath)),
("@id", "cmps/" + urlencode(str(destpath))),
("@type", deepcopy(self._comp_types["DataFile"][0]))
])
out["_extensionSchemas"] = deepcopy(self._comp_types["DataFile"][1])
Expand All @@ -2514,7 +2514,7 @@ def _create_def_chksum_md(self, destpath):
out = OrderedDict([
("_schema", NERD_DEF + "Component"),
("@context", NERDM_CONTEXT),
("@id", "cmps/" + urlencode(destpath)),
("@id", "cmps/" + urlencode(str(destpath))),
("@type", deepcopy(self._comp_types["ChecksumFile"][0])),
("filepath", destpath)
])
Expand Down Expand Up @@ -2543,7 +2543,7 @@ def _create_def_subcoll_md(self, destpath):
out = OrderedDict([
("_schema", NERD_DEF + "Component"),
("@context", NERDM_CONTEXT),
("@id", "cmps/" + urlencode(destpath)),
("@id", "cmps/" + urlencode(str(destpath))),
("@type", deepcopy(self._comp_types["Subcollection"][0])),
("_extensionSchemas", deepcopy(self._comp_types["Subcollection"][1])),
("filepath", destpath)
Expand Down
24 changes: 19 additions & 5 deletions python/nistoar/pdr/publish/midas3/mdwsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
This web service provides the public access to the metadata and the data files provided
by the author to MIDAS.
"""
import os, sys, logging, json, re
import os, sys, logging, json, re, urllib
from wsgiref.headers import Headers
from cgi import parse_qs, escape as escape_qp
from collections import OrderedDict
Expand Down Expand Up @@ -96,6 +96,7 @@ def __call__(self, env, start_resp):
class Handler(object):

badidre = re.compile(r"[<>\s]")
enc = "ISO-8859-1"

def __init__(self, app, wsgienv, start_resp):
self.app = app
Expand Down Expand Up @@ -129,7 +130,8 @@ def add_header(self, name, value):
# thus, this will raise a UnicodeEncodeError if the input strings
# include Unicode (char code > 255).
e = "ISO-8859-1"
self._hdr.add_header(name.encode(e), value.encode(e))
onerr = "backslashreplace"
self._hdr.add_header(name.encode(e, onerr), value.encode(e, onerr))

def set_response(self, code, message):
self._code = code
Expand Down Expand Up @@ -410,10 +412,22 @@ def send_datafile(self, id, filepath):

self.set_response(200, "Data file found")
self.add_header('Content-Type', mtype)
self.add_header('Content-Disposition',
'inline; filename="%s"' % os.path.basename(filepath))
outname = os.path.basename(filepath)
try:
outname.encode("ISO-8859-1")
self.add_header('Content-Disposition',
'inline; filename="%s"' % outname)
except UnicodeError:
outname = urllib.quote(outname.encode())
self.add_header('Content-Disposition',
'inline; filename*=UTF-8''"%s"' % outname)
if xsend:
self.add_header('X-Accel-Redirect', xsend)
try:
xsend.encode("ISO-8859-1")
self.add_header('X-Accel-Redirect', xsend)
except UnicodeEncodeError:
xsend = urllib.quote(xsend.encode())
self.add_header('X-Accel-Redirect', xsend)
self.end_headers()

if xsend:
Expand Down
Binary file modified python/tests/nistoar/pdr/distrib/data/1491.1_0.mbag0_4-0.zip
Binary file not shown.
14 changes: 7 additions & 7 deletions python/tests/nistoar/pdr/preserv/bagger/test_midas.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ def test_available_files(self):

datafiles = self.bagr.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -505,7 +505,7 @@ def test_available_files(self):
# copy of trial3a.json in upload overrides
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(uplsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_baggermd_file_for(self):
self.bagr.ensure_base_bag()
Expand Down Expand Up @@ -596,7 +596,7 @@ def test_ensure_data_files(self):

self.bagr.ensure_data_files()
self.assertIsNotNone(self.bagr.datafiles)
self.assertEqual(len(self.bagr.datafiles), 5)
self.assertEqual(len(self.bagr.datafiles), 6)
self.assertEqual(len([d for d in self.bagr.datafiles.keys()
if d.endswith(".sha256")]), 2)

Expand Down Expand Up @@ -643,7 +643,7 @@ def test_registered_files(self):

datafiles = self.bagr.registered_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -655,14 +655,14 @@ def test_registered_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_available_files(self):
revsip = os.path.join(self.revdir, self.midasid[32:])

datafiles = self.bagr.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -674,7 +674,7 @@ def test_available_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_fileExaminer(self):
# turn on asyncexamine (but turn off autolaunch so that we can test
Expand Down
16 changes: 8 additions & 8 deletions python/tests/nistoar/pdr/preserv/bagger/test_midas3.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_pod_rec(self):
def test_available_files(self):
datafiles = self.sip.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -120,7 +120,7 @@ def test_available_files(self):
# copy of trial3a.json in upload overrides
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(self.sip.upldatadir, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_registered_files(self):
pod = utils.read_json(os.path.join(self.revdir, "1491", "_pod.json"))
Expand All @@ -130,7 +130,7 @@ def test_registered_files(self):
datafiles = self.sip.registered_files()

self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 4)
self.assertEqual(len(datafiles), 5)
self.assertIn("trial1.json", datafiles)
self.assertNotIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -143,7 +143,7 @@ def test_registered_files(self):
os.path.join(self.sip.revdatadir, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(self.sip.revdatadir, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 4)
self.assertEqual(len(datafiles), 5)

def test_fromPOD(self):
podf = os.path.join(self.revdir, "1491", "_pod.json")
Expand Down Expand Up @@ -775,7 +775,7 @@ def test_ensure_data_files(self):
self.bagr.ensure_data_files(examine="sync")

self.assertIsNotNone(self.bagr.datafiles)
self.assertEqual(len(self.bagr.datafiles), 5)
self.assertEqual(len(self.bagr.datafiles), 6)
self.assertEqual(len([d for d in self.bagr.datafiles.keys()
if d.endswith(".sha256")]), 2)

Expand Down Expand Up @@ -836,14 +836,14 @@ def test_registered_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_available_files(self):
revsip = os.path.join(self.revdir, self.midasid[32:])

datafiles = self.bagr.sip.available_files()
self.assertIsInstance(datafiles, dict)
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)
self.assertIn("trial1.json", datafiles)
self.assertIn("trial1.json.sha256", datafiles)
self.assertIn("trial2.json", datafiles)
Expand All @@ -855,7 +855,7 @@ def test_available_files(self):
os.path.join(revsip, "trial2.json"))
self.assertEqual(datafiles["trial3/trial3a.json"],
os.path.join(revsip, "trial3/trial3a.json"))
self.assertEqual(len(datafiles), 5)
self.assertEqual(len(datafiles), 6)

def test_fileExaminer_autolaunch(self):
# show that the async thread does its work with autolaunch
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,28 @@
],
"valid": false
},
{
"filepath": "trial3/trial3\u03b1.json",
"checksum": {
"hash": "7b58010c841b7748a48a7ac6366258d5b5a8d23d756951b6059c0e80daad516b",
"algorithm": {
"tag": "sha256",
"@type": "Thing"
}
},
"mediaType": "application/json",
"downloadURL": "https://data.nist.gov/od/ds/3A1EE2F169DD3B8CE0531A570681DB5D1491/trial3/trial3%CE%B1.json",
"size": 70,
"@id": "cmps/trial3/trial3\u03b1.json",
"@type": [
"nrdp:DataFile",
"nrdp:DownloadableFile",
"dcat:Distribution"
],
"_extensionSchemas": [
"https://data.nist.gov/od/dm/nerdm-schema/pub/v0.2#/definitions/DataFile"
]
},
{
"description": "Simulation of experiment",
"filepath": "sim++.json",
Expand Down Expand Up @@ -328,4 +350,4 @@
"filepath": "sim++.json"
}
]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
"downloadURL": "https://data.nist.gov/od/ds/3A1EE2F169DD3B8CE0531A570681DB5D1491/trial3/trial3a.json.sha256",
"mediaType": "text/plain"
},
{
"description": "Third trial of experiment",
"downloadURL": "https://data.nist.gov/od/ds/3A1EE2F169DD3B8CE0531A570681DB5D1491/trial3/trial3%CE%B1.json",
"mediaType": "application/json",
"title": "Trial 3-alpha: JSON version of the Mathematica notebook"
},
{
"description": "Simulation of experiment",
"downloadURL": "https://s3.amazonaws.com/nist-midas/1491/sim%2B%2B.json",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"name": "tx3a",
"date": "2017-02-02",
"result": false
}
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def test_update(self):
nerdm = bag.nerdm_record()
self.assertEqual(nerdm.get('accessLevel'), "restricted public")
self.assertEqual(nerdm.get('disclaimer'), "Be careful.")
self.assertEqual(len(nerdm['components']), 9)
self.assertEqual(len(nerdm['components']), 10)
self.assertEquals(nerdm['version'], "1.0.1")

# serialize
Expand Down Expand Up @@ -347,7 +347,7 @@ def test_update(self):
nerdm = bag.nerdm_record()
self.assertEqual(nerdm.get('accessLevel'), "restricted public")
self.assertEqual(nerdm.get('disclaimer'), "Be careful.")
self.assertEqual(len(nerdm['components']), 9)
self.assertEqual(len(nerdm['components']), 10)
self.assertEquals(nerdm['version'], "1.0.1")


Expand Down
22 changes: 21 additions & 1 deletion python/tests/nistoar/pdr/publish/midas3/test_mdwsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def test_good_id(self):
self.assertGreater(len([l for l in self.resp if "Content-Type:" in l]),0)
data = json.loads(body[0])
self.assertEqual(data['ediid'], '3A1EE2F169DD3B8CE0531A570681DB5D1491')
self.assertEqual(len(data['components']), 8)
self.assertEqual(len(data['components']), 9)
for cmp in data['components']:
if 'downloadURL' in cmp:
self.assertNotIn("/od/ds/", cmp['downloadURL'])
Expand Down Expand Up @@ -172,6 +172,26 @@ def test_get_datafile2(self):
self.assertGreater(len(redirect), 0)
self.assertEqual(redirect[0],"X-Accel-Redirect: /midasdata/upload_dir/1491/trial3/trial3a.json")

def test_get_datafile_unicode(self):
    """
    send_datafile should serve a file whose name contains non-Latin-1
    characters, emitting a percent-encoded X-Accel-Redirect header.
    """
    env = {
        'PATH_INFO': '/3A1EE2F169DD3B8CE0531A570681DB5D1491/trial\xce\xb1.json',
        'REQUEST_METHOD': 'GET'
    }
    hdlr = wsgi.Handler(self.svc, env, self.start)
    body = hdlr.send_datafile('3A1EE2F169DD3B8CE0531A570681DB5D1491',
                              u"trial3/trial3\u03b1.json")

    # a 200 response must have been recorded
    self.assertGreater(len(self.resp), 0)
    self.assertIn("200", self.resp[0])

    # the redirect header must carry the %-encoded (UTF-8) file path
    accel = [hdr for hdr in self.resp if "X-Accel-Redirect:" in hdr]
    self.assertGreater(len(accel), 0)
    self.assertEqual(accel[0],
             "X-Accel-Redirect: /midasdata/review_dir/1491/trial3/trial3%CE%B1.json")

    # the content type must reflect the file's extension
    ctype = [hdr for hdr in self.resp if "Content-Type:" in hdr]
    self.assertGreater(len(ctype), 0)
    self.assertEqual(ctype[0], "Content-Type: application/json")

def test_test_permission_read(self):
hdlr = wsgi.Handler(self.svc, {}, self.start)
body = hdlr.test_permission('mds2-2000', "read", "me")
Expand Down
11 changes: 11 additions & 0 deletions python/tests/nistoar/pdr/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
testdatadir = os.path.join(testdir, 'data')
testdatadir3 = os.path.join(testdir, 'preserv', 'data')
testdatadir2 = os.path.join(testdatadir3, 'simplesip')
testdatadir4 = os.path.join(testdatadir3, 'midassip', 'review', '1491')

loghdlr = None
rootlog = None
Expand Down Expand Up @@ -96,6 +97,8 @@ def test_checksum_of(self):
self.assertEqual(utils.checksum_of(dfile), self.syssum(dfile))
dfile = os.path.join(testdatadir2,"trial3/trial3a.json")
self.assertEqual(utils.checksum_of(dfile), self.syssum(dfile))
dfile = os.path.join(testdatadir4,u"trial3/trial3\u03b1.json")
self.assertEqual(utils.checksum_of(dfile), self.syssum(dfile))

def syssum(self, filepath):
cmd = ["sha256sum", filepath]
Expand Down Expand Up @@ -285,6 +288,14 @@ def write_test_data(self):
with open(self.testdata) as fd:
data = json.load(fd)

def test_write_unicode_name(self):
    """
    write_json should accept a destination path containing non-ASCII
    characters and round-trip the data intact.
    """
    expected = utils.read_json(self.testdata)
    expected['foo'] = 'bar'

    # destination filename includes a Greek alpha
    dest = self.tf(u"d\u03b1ta.json")
    utils.write_json(expected, dest)

    # reading the file back must reproduce the written data exactly
    self.assertEqual(utils.read_json(dest), expected)

def test_writes(self):
# this is not a definitive test that the use of LockedFile is working
data = utils.read_json(self.testdata)
Expand Down

0 comments on commit ca554c9

Please sign in to comment.