Skip to content

Commit

Permalink
Merge pull request #40 from planetlabs/atomic-enqueue
Browse files Browse the repository at this point in the history
Enqueue file atomically.
  • Loading branch information
shomchak authored Jun 19, 2017
2 parents d8c8b2a + 1ebe45d commit bdb5f02
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 11 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,7 @@ docs/_build/

# PyBuilder
target/

# Vim
*.swp
*.un~
20 changes: 13 additions & 7 deletions datalake/dlfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def _validate_bundle(bundle_filename):
def _validate_bundle_version(bundle):
    '''Ensure the bundle declares a supported bundle version.

    Reads the 'version' member from the tar bundle and compares it to
    File.DATALAKE_BUNDLE_VERSION.

    Raises:
        InvalidDatalakeBundle: if the version recorded in the bundle does
            not match the supported version.
    '''
    v = File._get_content_from_bundle(bundle, 'version').decode('utf-8')
    if v != File.DATALAKE_BUNDLE_VERSION:
        # NOTE: no trailing period in the message; it is composed into a
        # larger sentence by callers (e.g. '... . Skipping upload.').
        msg = '{} has unsupported bundle version {}'
        msg = msg.format(bundle.name, v)
        raise InvalidDatalakeBundle(msg)

Expand Down Expand Up @@ -200,16 +200,22 @@ def to_bundle(self, bundle_filename):
Args:
bundle_filename: output file
'''
# Write the bundle to a temporary dot-prefixed file in the destination
# directory, then atomically rename it into place.  Consumers watching the
# directory therefore never observe a partially-written bundle.
temp_filename = self._dot_filename(bundle_filename)
with open(temp_filename, 'wb') as f:
    t = tarfile.open(fileobj=f, mode='w')
    self._add_fd_to_tar(t, 'content', self._fd)
    self._add_string_to_tar(t, 'version', self.DATALAKE_BUNDLE_VERSION)
    self._add_string_to_tar(t, 'datalake-metadata.json',
                            self.metadata.json)
    # Close the tarfile before the underlying file object: close() writes
    # the end-of-archive blocks and flushes buffered records.
    t.close()
# rename is atomic on POSIX when source and destination share a filesystem,
# which holds here because _dot_filename stays in the same directory.
os.rename(temp_filename, bundle_filename)

# reset the file pointer in case somebody else wants to read us.
self.seek(0, 0)

def _dot_filename(self, path):
return os.path.join(os.path.dirname(path),
'.{}'.format(os.path.basename(path)))

def _add_string_to_tar(self, tfile, arcname, data):
s = BytesIO(data.encode('utf-8'))
info = tarfile.TarInfo(name=arcname)
Expand Down
13 changes: 10 additions & 3 deletions datalake/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
reason, the file remains in the queue.
'''
from os import environ
import os
from datalake_common.errors import InsufficientConfiguration
from logging import getLogger
import os
import time

from datalake import File
from datalake import File, InvalidDatalakeBundle


'''whether or not queue feature is available
Expand Down Expand Up @@ -132,7 +132,14 @@ def _setup_watch_manager(self, timeout):
pyinotify.IN_CLOSE_WRITE | pyinotify.IN_MOVED_TO)

def _push(self, filename):
    '''Upload the bundle at *filename* to the archive.

    Dot-prefixed files are ignored: bundles are first written to a
    dot-prefixed temporary name and atomically renamed into place, so a
    dotfile in the queue is by definition incomplete.  Files that are not
    valid datalake bundles are logged and skipped rather than crashing
    the uploader.
    '''
    if os.path.basename(filename).startswith('.'):
        return
    try:
        f = File.from_bundle(filename)
    except InvalidDatalakeBundle as e:
        msg = '{}. Skipping upload.'.format(e.args[0])
        # log.exception records the traceback for the skipped bundle
        log.exception(msg)
        return
    url = self._archive.push(f)
    msg = 'Pushed {}({}) to {}'.format(filename, f.metadata['path'], url)
    log.info(msg)
Expand Down
45 changes: 44 additions & 1 deletion test/test_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import os
from datalake_common.tests import random_word
from datalake_common.errors import InsufficientConfiguration
from datalake import Enqueuer, Uploader
from datalake import Enqueuer, Uploader, InvalidDatalakeBundle
from datalake.queue import has_queue
from conftest import crtime_setuid
from gzip import GzipFile
Expand Down Expand Up @@ -71,6 +71,15 @@ def validator(f):
return validator


@pytest.fixture
def assert_s3_bucket_empty(s3_bucket):
    '''Fixture yielding a callable that asserts the bucket holds no keys.'''

    def check():
        keys = [k for k in s3_bucket.list()]
        assert len(keys) == 0

    return check


@pytest.fixture
def random_file(tmpfile, random_metadata):
expected_content = random_word(100)
Expand Down Expand Up @@ -103,6 +112,40 @@ def enqueue():
uploaded_file_validator(f)


@pytest.mark.skipif(not has_queue, reason='requires queuable features')
def test_skip_incoming_dotfile(random_file, queue_dir, uploader,
                               assert_s3_bucket_empty):
    '''A dot-prefixed file appearing in the queue must not be uploaded.'''

    def move_dotfile_into_queue():
        dest = os.path.join(queue_dir, '.ignoreme')
        os.rename(str(random_file), dest)

    # Drop the dotfile into the queue while the uploader is listening.
    Timer(0.5, move_dotfile_into_queue).start()
    uploader.listen(timeout=1.0)

    assert_s3_bucket_empty()


@pytest.mark.skipif(not has_queue, reason='requires queuable features')
def test_skip_invalid_bundles(random_file, queue_dir, uploader,
                              assert_s3_bucket_empty):
    '''A file that is not a valid bundle is skipped without raising.'''

    def move_invalid_file_into_queue():
        dest = os.path.join(queue_dir, 'invalid-bundle')
        os.rename(str(random_file), dest)

    timer = Timer(0.5, move_invalid_file_into_queue)
    timer.start()

    # The uploader must swallow the invalid bundle, not propagate it.
    try:
        uploader.listen(timeout=1.0)
    except InvalidDatalakeBundle:
        pytest.fail("Didn't catch InvalidDatalakeBundle exception.")

    assert_s3_bucket_empty()


@pytest.mark.skipif(not has_queue, reason='requires queuable features')
def test_upload_existing_cli(cli_tester, random_file, random_metadata,
uploaded_content_validator, queue_dir):
Expand Down

0 comments on commit bdb5f02

Please sign in to comment.