basic support for json-ld in body
adds support for sites like dataverse.no
borsna committed Sep 23, 2023
1 parent 546aae0 commit 044132d
Showing 5 changed files with 45 additions and 21 deletions.
README.md (17 changes: 10 additions & 7 deletions)
@@ -41,13 +41,16 @@ landing page: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7
 704.8KiB downloaded
 ```

-## Supported data respositories (confirmed)
-* Dataverse - https://dataverse.harvard.edu
-* SND - https://snd.se/catalogue
-* Zenodo - https://zenodo.org
-
-## Semi-supported respositories
-* Figshare - https://su.figshare.com & https://figshare.scilifelab.se (more testing needed)
+## Supported data repositories with file metadata
+* schema.org/Dataset
+  * https://dataverse.harvard.edu
+  * https://dataverse.no
+  * https://snd.se/catalogue
+* figshare
+  * https://su.figshare.com
+  * https://figshare.scilifelab.se
+* zenodo
+  * https://zenodo.org

 ## Alternatives

daget/__init__.py (4 changes: 4 additions & 0 deletions)
@@ -0,0 +1,4 @@
+import os, argparse
+from daget.utils import *
+from daget.repos import get_file_list_from_repo
+from daget.exceptions import ResolveError, RepoError
daget/__main__.py (2 changes: 1 addition & 1 deletion)
@@ -15,7 +15,7 @@ def main():

     parser.add_argument('url', help="URL/DOI to the dataset")
     parser.add_argument('destination', help="Full or relative path to destination directory")
-    parser.add_argument("--list-only", action="store_true", help="Skip download")
+    parser.add_argument("-l", "--list-only", action="store_true", help="Skip download")

     args = parser.parse_args()
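For context on the change above: argparse lets one action register both a short and a long option, and both spellings set the same attribute on the parsed namespace. A minimal standalone sketch, not the project's full parser:

```python
import argparse

# Standalone sketch of the short/long flag pairing added above.
parser = argparse.ArgumentParser(prog="daget")
parser.add_argument("-l", "--list-only", action="store_true", help="Skip download")

# Both spellings set the same list_only attribute.
assert parser.parse_args(["-l"]).list_only
assert parser.parse_args(["--list-only"]).list_only
```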
daget/repos.py (39 changes: 28 additions & 11 deletions)
@@ -1,4 +1,5 @@
-import requests, urllib
+import requests, urllib, json.decoder
+from bs4 import BeautifulSoup

 def get_file_list_from_repo(url):
     url_parsed = urllib.parse.urlparse(url)
@@ -14,20 +15,36 @@ def get_file_list_from_repo(url):
     return get_file_list_schema_org(url)

 def get_file_list_schema_org(url):
+    daget_headers={'User-Agent' : 'daget', 'Accept' : 'application/ld+json'}
+
+    result = requests.get(url, headers=daget_headers)
+
+    # retry with host header
+    if not result.ok:
+        daget_headers['Host'] = 'localhost'
+        result = requests.get(url, headers=daget_headers)
+
     try:
-        r=requests.get(url, headers={'User-Agent' : 'daget', 'Accept' : 'application/ld+json'})
-        schema_org = r.json()
+        schema_org = result.json()
     except:
-        r=requests.get(url, headers={'Host': 'daget', 'User-Agent' : 'daget', 'Accept' : 'application/ld+json'})
-        schema_org = r.json()
+        result = requests.get(url)
+        soup = BeautifulSoup(result.text, "html.parser")
+        text = "".join(soup.find('script', {'type':'application/ld+json'}).contents)
+
+        schema_org = json.loads(text)
+
+    schema_files = schema_org['distribution']
+    if not isinstance(schema_files, list):
+        schema_files = [schema_files]

     files = []
-    for file in schema_org['distribution']:
-        files.append({
-            'url' : file['contentUrl'],
-            'size' : file['contentSize'],
-            'name' : file['name']
-        })
+    for f in schema_files:
+        file={
+            'url' : f.get('contentUrl', None),
+            'size' : f.get('contentSize', None),
+            'name' : f.get('name', None)
+        }
+        files.append(file)

     return files

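The gist of the change: when a repository does not honour `Accept: application/ld+json` (the behaviour seen on dataverse.no), the JSON-LD is scraped from a `<script type="application/ld+json">` tag in the landing page instead, and a lone `distribution` object is normalised to a list. A self-contained sketch of that fallback path; the `page_html` snippet and its file entry are made up for illustration:

```python
import json
from bs4 import BeautifulSoup

# Hypothetical landing page that embeds schema.org/Dataset JSON-LD in the
# HTML body instead of serving it for Accept: application/ld+json.
page_html = """
<html><head>
<script type="application/ld+json">
{"@context": "http://schema.org", "@type": "Dataset",
 "distribution": {"@type": "DataDownload",
                  "name": "data.csv",
                  "contentUrl": "https://example.org/data.csv",
                  "contentSize": "1024"}}
</script>
</head><body>...</body></html>
"""

# Extract and parse the embedded JSON-LD, as the new except branch does.
soup = BeautifulSoup(page_html, "html.parser")
text = "".join(soup.find('script', {'type': 'application/ld+json'}).contents)
schema_org = json.loads(text)

# A single distribution comes back as a dict, not a list; normalise it,
# mirroring the isinstance check in repos.py.
schema_files = schema_org['distribution']
if not isinstance(schema_files, list):
    schema_files = [schema_files]

for f in schema_files:
    print(f.get('name'), f.get('contentUrl'), f.get('contentSize'))
```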
pyproject.toml (4 changes: 2 additions & 2 deletions)
@@ -1,6 +1,6 @@
[project]
name = "daget"
version = "0.4"
version = "0.5"
description = "Download dataset via DOI or landing page url"
authors = [
{ name = "Olof Olsson", email = "[email protected]" }
Expand All @@ -16,7 +16,7 @@ classifiers = [
"Programming Language :: Python :: 3.11"
]
license = {text = "MIT"}
dependencies = ["requests"]
dependencies = ["requests", "beautifulsoup4"]
requires-python = ">=3.6"

[project.scripts]
