This repository has been archived by the owner on Feb 1, 2024. It is now read-only.

Feature/regex (#103)
* harvester admins no longer see all paths

* monitored paths now have a regex property to filter results prior to harvesting
mjaquiery authored Jun 27, 2023
1 parent 9ee5890 commit aaf43a5
Showing 10 changed files with 62 additions and 22 deletions.
6 changes: 6 additions & 0 deletions backend/backend_django/galvanalyser/models.py
@@ -94,6 +94,12 @@ class MonitoredPath(models.Model):
help_text="Harvester with access to this directory"
)
path = models.TextField(help_text="Directory location on Harvester")
regex = models.TextField(
null=True,
help_text="""
        Python re regular expression to filter files by,
applied to full file name starting from this Path's directory"""
)
stable_time = models.PositiveSmallIntegerField(
default=60,
help_text="Number of seconds files must remain stable to be processed"
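
The models.py hunk above adds a nullable regex column, but no Django migration appears among the ten changed files, so the schema change has to ship separately. A minimal sketch of what such a migration could look like — the app label and dependency are placeholders, not taken from this commit:

# Illustrative migration sketch for MonitoredPath.regex; not part of this diff.
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        # Placeholder: the real previous migration for the app will differ.
        ('galvanalyser', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='monitoredpath',
            name='regex',
            field=models.TextField(
                null=True,
                help_text="Python re regular expression to filter files by, "
                          "applied to full file name starting from this Path's directory",
            ),
        ),
    ]
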
9 changes: 8 additions & 1 deletion backend/backend_django/galvanalyser/serializers.py
@@ -290,6 +290,13 @@ def validate_stable_time(self, value):
except (TypeError, ValueError, AssertionError):
raise ValidationError(f"stable_time value '{value}' is not a positive integer")

def validate_regex(self, value):
try:
re.compile(value)
return value
except BaseException as e:
raise ValidationError(f"Invalid regex: {e.__context__}")

def validate(self, attrs):
# Verify user is allowed to create/modify paths
if self.instance is not None:
@@ -308,7 +315,7 @@ def validate(self, attrs):

class Meta:
model = MonitoredPath
fields = ['url', 'id', 'path', 'stable_time', 'harvester', 'user_sets']
fields = ['url', 'id', 'path', 'regex', 'stable_time', 'harvester', 'user_sets']
read_only_fields = ['url', 'id', 'harvester', 'user_sets']
extra_kwargs = augment_extra_kwargs()

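
For comparison with the validate_regex added above, which catches BaseException and reports e.__context__, a narrower sketch would catch re.error directly and surface the compile error itself. It is written here as a standalone helper purely for illustration; in the project it would remain a serializer method:

import re

from rest_framework.serializers import ValidationError


def validate_regex(value):
    # Field-level validator sketch: reject patterns Python's re module cannot compile.
    if value is None:
        return value  # regex is optional; None disables filtering
    try:
        re.compile(value)
    except re.error as exc:
        # re.error carries the reason and the offending position within the pattern
        raise ValidationError(f"Invalid regex: {exc}")
    return value
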
1 change: 1 addition & 0 deletions backend/backend_django/galvanalyser/tests/factories.py
@@ -75,6 +75,7 @@ class Meta:
)

path = factory.LazyAttribute(lambda x: os.path.dirname(x.p))
regex = ".*"
harvester = factory.SubFactory(HarvesterFactory)

@factory.post_generation
@@ -23,7 +23,7 @@ def setUp(self):
self.user = UserFactory.create(username='test_user')
self.admin_user = UserFactory.create(username='test_user_admin')
self.user.groups.add(self.harvester.user_group)
self.admin_user.groups.add(self.harvester.admin_group)
self.admin_user.groups.add(self.dataset.file.monitored_path.admin_group)
self.url = reverse('dataset-detail', args=(self.dataset.id,))

def test_view(self):
@@ -27,7 +27,7 @@ def setUp(self):
self.user = UserFactory.create(username='test_user')
self.admin_user = UserFactory.create(username='test_user_admin')
self.user.groups.add(self.harvester.user_group)
self.admin_user.groups.add(self.harvester.admin_group)
self.admin_user.groups.add(self.path.admin_group)
self.url = reverse('observedfile-detail', args=(self.files[0].id,))

def test_view(self):
14 changes: 10 additions & 4 deletions backend/backend_django/galvanalyser/tests/test_view_path.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: BSD-2-Clause
# Copyright (c) 2020-2023, The Chancellor, Masters and Scholars of the University
# of Oxford, and the 'Galvanalyser' Developers. All rights reserved.

import json
import unittest
from django.urls import reverse
from rest_framework import status
@@ -48,6 +48,7 @@ def test_create(self):
self.client.force_login(self.user)
body = {
'path': self.path,
'regex': '.*',
'harvester': reverse('harvester-detail', args=(self.harvester.id,)),
'stable_time': 60
}
@@ -72,25 +73,30 @@ def test_create(self):

def test_update(self):
path = MonitoredPathFactory.create(path=self.path, harvester=self.harvester)
self.admin_user.groups.add(path.admin_group)
url = reverse('monitoredpath-detail', args=(path.id,))
print("Test update rejected - authorisation")
self.client.force_login(self.user)
body = {'path': path.path, 'stable_time': 100}
body = {'path': path.path, 'regex': '^abc', 'stable_time': 100}
self.assertEqual(
self.client.patch(url, body).status_code,
status.HTTP_404_NOT_FOUND
)
print("OK")
print("Test update okay")
self.client.force_login(self.admin_user)
body = {'path': path.path, 'stable_time': 1}
body = {'path': path.path, 'regex': '^abc', 'stable_time': 1}
self.assertEqual(
self.client.patch(url, body).status_code,
status.HTTP_200_OK
)
self.assertEqual(
MonitoredPath.objects.get(path=path.path, harvester__id=self.harvester.id).stable_time,
1
body.get('stable_time')
)
self.assertEqual(
MonitoredPath.objects.get(path=path.path, harvester__id=self.harvester.id).regex,
body.get('regex')
)
print("OK")

15 changes: 5 additions & 10 deletions backend/backend_django/galvanalyser/views.py
@@ -692,8 +692,7 @@ def get_serializer_class(self):
def get_queryset(self):
return MonitoredPath.objects.filter(
Q(user_group__in=self.request.user.groups.all()) |
Q(admin_group__in=self.request.user.groups.all()) |
Q(harvester__admin_group__in=self.request.user.groups.all())
Q(admin_group__in=self.request.user.groups.all())
).order_by('-id')


@@ -758,8 +757,7 @@ class ObservedFileViewSet(viewsets.ModelViewSet):
def get_queryset(self):
return ObservedFile.objects.filter(
Q(monitored_path__user_group__in=self.request.user.groups.all()) |
Q(monitored_path__admin_group__in=self.request.user.groups.all()) |
Q(monitored_path__harvester__admin_group__in=self.request.user.groups.all())
Q(monitored_path__admin_group__in=self.request.user.groups.all())
).order_by('-last_observed_time', '-id')

@action(detail=True, methods=['GET'])
@@ -822,8 +820,7 @@ class DatasetViewSet(viewsets.ModelViewSet):
def get_queryset(self):
return Dataset.objects.filter(
Q(file__monitored_path__user_group__in=self.request.user.groups.all()) |
Q(file__monitored_path__admin_group__in=self.request.user.groups.all()) |
Q(file__monitored_path__harvester__admin_group__in=self.request.user.groups.all())
Q(file__monitored_path__admin_group__in=self.request.user.groups.all())
).order_by('-date', '-id')


@@ -861,8 +858,7 @@ class HarvestErrorViewSet(viewsets.ReadOnlyModelViewSet):
def get_queryset(self):
return HarvestError.objects.filter(
Q(path__user_group__in=self.request.user.groups.all()) |
Q(path__admin_group__in=self.request.user.groups.all()) |
Q(harvester__admin_group__in=self.request.user.groups.all())
Q(path__admin_group__in=self.request.user.groups.all())
).order_by('-timestamp')


@@ -1144,8 +1140,7 @@ class DataColumnViewSet(viewsets.ReadOnlyModelViewSet):
def get_queryset(self):
datasets_ids = [d.id for d in Dataset.objects.filter(
Q(file__monitored_path__user_group__in=self.request.user.groups.all()) |
Q(file__monitored_path__admin_group__in=self.request.user.groups.all()) |
Q(file__monitored_path__harvester__admin_group__in=self.request.user.groups.all())
Q(file__monitored_path__admin_group__in=self.request.user.groups.all())
).only('id')]
return DataColumn.objects.filter(dataset_id__in=datasets_ids).order_by('-dataset_id', '-id')

17 changes: 16 additions & 1 deletion frontend/src/HarvesterDetail.tsx
@@ -22,6 +22,7 @@ export type MonitoredPathFields = {
harvester: number;
stable_time: number;
path: string;
regex: string;
user_sets: UserSet[];
}
export type HarvesterDetailProps = {
@@ -42,13 +43,14 @@ export default function HarvesterDetail(props: HarvesterDetailProps) {
const deletePath = (data: MonitoredPathFields) => Connection.fetch(data.url, {method: 'DELETE'})

const updatePath = (data: MonitoredPathFields) => {
const insert_data = {path: data.path, stable_time: data.stable_time}
const insert_data = {path: data.path, regex: data.regex, stable_time: data.stable_time}
return Connection.fetch(data.url, {body: JSON.stringify(insert_data), method: 'PATCH'})
.then(r => r.content)
};

const columns = [
{label: 'Path', help: 'Directory to watch'},
        {label: 'RegEx', help: 'Python re regular expression applied to the filename after Path. Matching files will be imported'},
{label: 'Stable Time (s)', help: 'Seconds files must remain unchanged to be considered stable and imported'},
{label: 'Users', help: 'Users with access to this path\'s datasets'},
{label: 'Actions', help: 'Inspect / Save / Delete monitored path (imported datasets will remain)'},
@@ -77,6 +79,19 @@ export default function HarvesterDetail(props: HarvesterDetailProps) {
onChange={context.update}
/>
</Fragment>,
<Fragment>
<TextField
InputProps={{
classes: {
input: classes.resize,
}
}}
placeholder=".*"
value={row.regex}
name="regex"
onChange={context.update}
/>
</Fragment>,
<Fragment>
<TextField
type="number"
14 changes: 11 additions & 3 deletions harvester/harvester/run.py
@@ -3,6 +3,7 @@
# of Oxford, and the 'Galvanalyser' Developers. All rights reserved.

import os.path
import re
import time

from .parse.exceptions import UnsupportedFileTypeError
@@ -32,16 +33,23 @@ def harvest():
logger.debug(paths)

for path in paths:
harvest_path(path.get('path'))
harvest_path(path.get('path'), path.get('regex'))


def harvest_path(path: os.PathLike):
logger.info(f"Harvesting from {path}")
def harvest_path(path: os.PathLike, regex_str: str = None):
if regex_str is not None:
logger.info(f"Harvesting from {path} with regex {regex_str}")
else:
logger.info(f"Harvesting from {path}")
try:
regex = re.compile(regex_str) if regex_str is not None else None
for (dir_path, dir_names, filenames) in os.walk(path):
for filename in filenames:
full_path = os.path.join(dir_path, filename)
core_path, file_path = split_path(path, full_path)
if regex is not None and not regex.match(file_path):
logger.debug(f"Skipping {file_path} as it does not match regex {regex}")
continue
try:
get_import_file_handler(full_path)
except UnsupportedFileTypeError:
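
harvest_path applies regex.match to file_path, which appears to be the part of the file name after the monitored directory (judging by the use of split_path above). Because re.match anchors only at the start of the string, here is a short illustration of how patterns such as the '^abc' used in test_view_path.py behave — the file names below are made up:

import re

# file_path stands for the path relative to the monitored directory,
# i.e. what harvest_path passes to regex.match above.
print(bool(re.compile(r"^abc").match("abc/run1.csv")))       # True  - relative path starts with "abc"
print(bool(re.compile(r"^abc").match("data/abc/run1.csv")))  # False - match() only anchors at position 0
print(bool(re.compile(r".*\.csv$").match("data/run1.csv")))  # True  - suffix-only filters need a leading .*
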
4 changes: 3 additions & 1 deletion harvester/test/test_harvester.py
@@ -27,7 +27,8 @@ def json(self):
"monitored_paths": [
{
"path": get_test_file_path(),
"stable_time": 0
"stable_time": 0,
"regex": "^(?!.*\\.skip$).*$",
}
],
"standard_units": [
@@ -190,6 +191,7 @@ def test_config_update(
def test_harvest_path(self, mock_logger, mock_import, mock_report):
# Create an unparsable file in the test set
Path(os.path.join(get_test_file_path(), 'unparsable.foo')).touch(exist_ok=True)
Path(os.path.join(get_test_file_path(), 'skipped_by_regex.skip')).touch(exist_ok=True)
mock_logger.error = fail
mock_report.return_value = JSONResponse(200, {'state': 'STABLE'})
mock_import.return_value = True
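
The fixture's regex "^(?!.*\.skip$).*$" uses a negative lookahead so that every file matches except those ending in .skip, which is what the new skipped_by_regex.skip file exercises. A quick check of that behaviour with made-up file names:

import re

skip_pattern = re.compile(r"^(?!.*\.skip$).*$")

print(bool(skip_pattern.match("cells/cell01.csv")))             # True  - harvested as usual
print(bool(skip_pattern.match("cells/skipped_by_regex.skip")))  # False - excluded by the lookahead
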
