cmdline: use reactor from scrapy project settings
pawelmhm committed Dec 6, 2021
1 parent b003308 commit f6757f7
Showing 10 changed files with 85 additions and 53 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -21,9 +21,12 @@ _trial_temp
/dist/
/test_urls*
testing_scripts*
coverage.xml
.python-version

# IDE
/.idea/
.vscode

# tests
.coverage
@@ -33,4 +36,3 @@ testing_scripts*

# sphinx
docs/build/*
.python-version
41 changes: 26 additions & 15 deletions scrapyrt/cmdline.py
@@ -11,10 +11,11 @@
from twisted.application.service import Application
from twisted.python import log
from twisted.web.server import Site
from scrapyrt.conf.spider_settings import get_project_settings

from scrapyrt.utils import install_reactor

from .conf import settings
from .conf import app_settings
from .log import setup_logging


@@ -52,7 +53,7 @@ def valid_setting(string):


def get_application(arguments):
ServiceRoot = load_object(settings.SERVICE_ROOT)
ServiceRoot = load_object(app_settings.SERVICE_ROOT)
site = Site(ServiceRoot())
application = Application('scrapyrt')
server = TCPServer(arguments.port, site, interface=arguments.ip)
@@ -77,28 +78,38 @@ def find_scrapy_project(project):
return project_settings


def execute():
sys.path.insert(0, os.getcwd())
def run_application(reactor_type, arguments, app_settings):
if reactor_type is not None:
install_reactor(reactor_type)

arguments = parse_arguments()
if arguments.settings:
settings.setmodule(arguments.settings)
if arguments.set:
for name, value in arguments.set:
settings.set(name.upper(), value)

settings.set('PROJECT_SETTINGS', find_scrapy_project(arguments.project))
if settings.TWISTED_REACTOR is not None:
install_reactor(settings.TWISTED_REACTOR)
settings.freeze()
setup_logging()

application = get_application(arguments)
app_settings.freeze()
app.startApplication(application, save=False)
from twisted.internet import reactor
msg = f"Running with reactor: {reactor.__class__.__name__}. "
log.msg(msg)
reactor.run()


def execute():
sys.path.insert(0, os.getcwd())

arguments = parse_arguments()
if arguments.settings:
app_settings.setmodule(arguments.settings)
if arguments.set:
for name, value in arguments.set:
app_settings.set(name.upper(), value)

app_settings.set('PROJECT_SETTINGS',
find_scrapy_project(arguments.project))
project_settings = get_project_settings()
reactor_type = app_settings.TWISTED_REACTOR or project_settings.get(
'TWISTED_REACTOR')
run_application(reactor_type, arguments, app_settings)


if __name__ == '__main__':
execute()
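
With this change the reactor type is no longer taken from the ScrapyRT settings alone: execute() loads the Scrapy project settings and falls back to their TWISTED_REACTOR value when app_settings.TWISTED_REACTOR is unset, and run_application() installs the chosen reactor before the Twisted application starts, logging its class name. A minimal sketch of how a project opts into the asyncio reactor, assuming a standard Scrapy project layout (the myproject package name is hypothetical):

    # myproject/settings.py -- hypothetical Scrapy project settings module.
    # ScrapyRT now reads this via get_project_settings() and passes the value
    # to install_reactor() before reactor.run() is called.
    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

If TWISTED_REACTOR is also set on the ScrapyRT side, that value wins, because execute() checks app_settings.TWISTED_REACTOR first.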
2 changes: 1 addition & 1 deletion scrapyrt/conf/__init__.py
@@ -36,4 +36,4 @@ def frozen(self):
return bool(getattr(self, '_frozen', False))


settings = Settings()
app_settings = Settings()
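
The application-level singleton is renamed from settings to app_settings to avoid confusion with the Scrapy project settings that cmdline now also loads. A short usage sketch, assuming the Settings API is otherwise unchanged (the myproject.scrapyrt_settings module name is hypothetical):

    from scrapyrt.conf import app_settings

    # Same API as the old `settings` object: load a settings module and
    # override individual values before execute() freezes the object.
    app_settings.setmodule('myproject.scrapyrt_settings')
    app_settings.set('TWISTED_REACTOR',
                     'twisted.internet.asyncioreactor.AsyncioSelectorReactor')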
4 changes: 2 additions & 2 deletions scrapyrt/conf/spider_settings.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from scrapy.settings import Settings

from . import settings
from . import app_settings


def get_scrapyrt_settings(log_file=None):
@@ -23,7 +23,7 @@ def get_scrapyrt_settings(log_file=None):
def get_project_settings(module=None, custom_settings=None):
crawler_settings = Settings()
if module is None:
module = settings.PROJECT_SETTINGS
module = app_settings.PROJECT_SETTINGS
crawler_settings.setmodule(module, priority='project')
if custom_settings:
assert isinstance(custom_settings, dict)
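
For reference, this is how cmdline consumes the helper after the rename; a minimal sketch based on the call made in execute():

    from scrapyrt.conf.spider_settings import get_project_settings

    # With no arguments, the module stored in app_settings.PROJECT_SETTINGS
    # (recorded by execute() via find_scrapy_project) is loaded at
    # 'project' priority.
    project_settings = get_project_settings()
    reactor_path = project_settings.get('TWISTED_REACTOR')  # may be None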
20 changes: 12 additions & 8 deletions scrapyrt/core.py
@@ -12,7 +12,7 @@
from twisted.internet import defer

from . import log
from .conf import settings
from .conf import app_settings
from .conf.spider_settings import get_scrapyrt_settings, get_project_settings
from .decorators import deprecated
from .log import setup_spider_logging
Expand All @@ -28,6 +28,7 @@ class ScrapyrtCrawler(Crawler):
TODO: PR to scrapy - ability to set start_requests here.
"""

def __init__(self, spidercls, crawler_settings, start_requests=False):
super(ScrapyrtCrawler, self).__init__(spidercls, crawler_settings)
self.start_requests = start_requests
@@ -65,7 +66,8 @@ def crawl(self, spidercls, *args, **kwargs):
if attr_or_m and callable(attr_or_m):
msg = 'Crawl argument cannot override spider method.'
msg += ' Got argument {} that overrides spider method {}'
raise Error('400', message=msg.format(kw, getattr(spidercls, kw)))
raise Error('400', message=msg.format(
kw, getattr(spidercls, kw)))
# creating our own crawler that will allow us to disable start requests easily
crawler = ScrapyrtCrawler(
spidercls, self.settings, self.scrapyrt_manager.start_requests)
@@ -81,7 +83,8 @@ def crawl(self, spidercls, *args, **kwargs):
signals.spider_error)
crawler.signals.connect(self.scrapyrt_manager.handle_scheduling,
signals.request_scheduled)
dfd = super(ScrapyrtCrawlerProcess, self).crawl(crawler, *args, **kwargs)
dfd = super(ScrapyrtCrawlerProcess, self).crawl(
crawler, *args, **kwargs)
_cleanup_handler = setup_spider_logging(crawler.spider, self.settings)

def cleanup_logging(result):
@@ -96,16 +99,17 @@ class CrawlManager(object):
Runs crawls
"""

def __init__(self, spider_name, request_kwargs, max_requests=None, start_requests=False):
def __init__(self, spider_name, request_kwargs,
max_requests=None, start_requests=False):
self.spider_name = spider_name
self.log_dir = settings.LOG_DIR
self.log_dir = app_settings.LOG_DIR
self.items = []
self.items_dropped = []
self.errors = []
self.max_requests = int(max_requests) if max_requests else None
self.timeout_limit = int(settings.TIMEOUT_LIMIT)
self.timeout_limit = int(app_settings.TIMEOUT_LIMIT)
self.request_count = 0
self.debug = settings.DEBUG
self.debug = app_settings.DEBUG
self.crawler_process = None
self.crawler = None
# callback will be added after instantiation of crawler object
@@ -136,7 +140,7 @@ def _get_log_file_path(self):
log_dir = os.path.join(self.log_dir, self.spider_name)
if not os.path.exists(log_dir):
os.makedirs(log_dir)
time_format = settings.SPIDER_LOG_FILE_TIMEFORMAT
time_format = app_settings.SPIDER_LOG_FILE_TIMEFORMAT
filename = datetime.datetime.now().strftime(time_format) + '.log'
return os.path.join(log_dir, filename)

14 changes: 7 additions & 7 deletions scrapyrt/log.py
@@ -10,7 +10,7 @@
from twisted.python.log import startLoggingWithObserver
from twisted.python.logfile import DailyLogFile

from .conf import settings as scrapyrt_settings
from .conf import app_settings
from .utils import to_bytes

DEBUG = logging.DEBUG
@@ -91,16 +91,16 @@ def filter(self, record):


def setup_logging():
if not os.path.exists(scrapyrt_settings.LOG_DIR):
os.makedirs(scrapyrt_settings.LOG_DIR)
if scrapyrt_settings.LOG_FILE:
if not os.path.exists(app_settings.LOG_DIR):
os.makedirs(app_settings.LOG_DIR)
if app_settings.LOG_FILE:
logfile = DailyLogFile.fromFullPath(
os.path.join(scrapyrt_settings.LOG_DIR,
scrapyrt_settings.LOG_FILE)
os.path.join(app_settings.LOG_DIR,
app_settings.LOG_FILE)
)
else:
logfile = sys.stderr
observer = ScrapyrtFileLogObserver(logfile, scrapyrt_settings.LOG_ENCODING)
observer = ScrapyrtFileLogObserver(logfile, app_settings.LOG_ENCODING)
startLoggingWithObserver(observer.emit, setStdout=False)

# setup general logging for Scrapy
15 changes: 9 additions & 6 deletions scrapyrt/resources.py
@@ -10,7 +10,7 @@
from twisted.web.error import Error, UnsupportedMethod

from . import log
from .conf import settings
from .conf import app_settings
from .utils import extract_scrapy_request_args, to_bytes


@@ -88,7 +88,8 @@ def format_error_response(self, exception, request):
# Python exceptions don't have message attribute in Python 3+ anymore.
# Twisted HTTP Error objects still have 'message' attribute even in 3+
# and they fail on str(exception) call.
msg = exception.message if hasattr(exception, 'message') else str(exception)
msg = exception.message if hasattr(
exception, 'message') else str(exception)

return {
"status": "error",
@@ -112,7 +113,7 @@ class RealtimeApi(ServiceResource):

def __init__(self, **kwargs):
super(RealtimeApi, self).__init__(self)
for route, resource_path in settings.RESOURCES.items():
for route, resource_path in app_settings.RESOURCES.items():
resource_cls = load_object(resource_path)
route = to_bytes(route)
self.putChild(route, resource_cls(self, **kwargs))
@@ -248,9 +249,11 @@ def prepare_crawl(self, api_params, scrapy_request_args, *args, **kwargs):
return dfd

def run_crawl(self, spider_name, scrapy_request_args,
max_requests=None, crawl_args=None, start_requests=False, *args, **kwargs):
crawl_manager_cls = load_object(settings.CRAWL_MANAGER)
manager = crawl_manager_cls(spider_name, scrapy_request_args, max_requests, start_requests=start_requests)
max_requests=None, crawl_args=None,
start_requests=False, *args, **kwargs):
crawl_manager_cls = load_object(app_settings.CRAWL_MANAGER)
manager = crawl_manager_cls(
spider_name, scrapy_request_args, max_requests, start_requests=start_requests)
if crawl_args:
kwargs.update(crawl_args)
dfd = manager.crawl(*args, **kwargs)
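
RealtimeApi now reads its route table from app_settings.RESOURCES; each dotted path is resolved with load_object() and mounted with putChild(), as the resources test further down also exercises. A hedged sketch of what a customized RESOURCES mapping could look like (the myproject.resources.HealthResource path is hypothetical):

    # In a ScrapyRT settings module: route -> dotted path of the resource class.
    RESOURCES = {
        'crawl.json': 'scrapyrt.resources.CrawlResource',
        'health.json': 'myproject.resources.HealthResource',  # hypothetical
    }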
11 changes: 10 additions & 1 deletion tests/test_cmdline.py
@@ -4,13 +4,15 @@
import tempfile
from collections import namedtuple
from os import path, chdir
from mock import patch

import port_for
import pytest
from scrapy.utils.conf import closest_scrapy_cfg
from twisted.python.components import Componentized

from scrapyrt.cmdline import find_scrapy_project, get_application
from scrapyrt.cmdline import execute, find_scrapy_project, get_application
from scrapyrt.conf import app_settings
from tests.utils import generate_project, get_testenv


@@ -58,6 +60,13 @@ def test_get_application(self):
app = get_application(make_fake_args())
assert isinstance(app, Componentized)

@patch('scrapyrt.cmdline.run_application')
@patch('scrapyrt.cmdline.parse_arguments',
new_callable=lambda: make_fake_args)
def test_execute(self, mock_pa, mock_run_app, workdir):
execute()
mock_run_app.assert_called_once_with(None, mock_pa(), app_settings)

@pytest.mark.parametrize('reactor,expected', [
("twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"AsyncioSelectorReactor"),
17 changes: 9 additions & 8 deletions tests/test_crawl_manager.py
@@ -17,7 +17,7 @@
from twisted.web.error import Error

from scrapyrt.core import CrawlManager
from scrapyrt.conf import settings
from scrapyrt.conf import app_settings

from .spiders import MetaSpider

@@ -204,22 +204,22 @@ def test_limit_runtime(self):
self._test_limit_runtime()

def test_string_number_timeout_value(self):
_timeout = settings.TIMEOUT_LIMIT
_timeout = app_settings.TIMEOUT_LIMIT
try:
settings.TIMEOUT_LIMIT = '1'
app_settings.TIMEOUT_LIMIT = '1'
self.crawl_manager = self.create_crawl_manager()
self._test_limit_runtime()
finally:
settings.TIMEOUT_LIMIT = _timeout
app_settings.TIMEOUT_LIMIT = _timeout

def test_wrong_timeout_value(self):
_timeout = settings.TIMEOUT_LIMIT
_timeout = app_settings.TIMEOUT_LIMIT
try:
settings.TIMEOUT_LIMIT = 'foo'
app_settings.TIMEOUT_LIMIT = 'foo'
self.assertRaises(
ValueError, CrawlManager, self.spider.name, self.kwargs.copy())
finally:
settings.TIMEOUT_LIMIT = _timeout
app_settings.TIMEOUT_LIMIT = _timeout


class TestHandleSpiderError(TestCrawlManager):
@@ -335,7 +335,8 @@ def setUp(self):
def test_return_items(self):
result = self.crawl_manager.return_items(None)
self.assertEqual(dict(result, **self.expected_result), result)
self.assertEqual(list(sorted(self.stats.keys())), list(result['stats'].keys()))
self.assertEqual(list(sorted(self.stats.keys())),
list(result['stats'].keys()))
# debug = True by default
self.assertIn('errors', result)
self.assertEquals(result['errors'], self.crawl_manager.errors)
10 changes: 6 additions & 4 deletions tests/test_resource_realtimeapi.py
@@ -5,7 +5,7 @@
from mock import patch
from twisted.trial import unittest

from scrapyrt.conf import settings
from scrapyrt.conf import app_settings
from scrapyrt.resources import RealtimeApi, ServiceResource, CrawlResource


@@ -32,10 +32,12 @@ def test_realtimeapi_with_default_settings(self):
# XXX: one inconvenience of singleton settings - complexities during tests,
# e.g. settings are mutable, when you change them in one test -
# changes will be kept unless you cleanup those changes or use mock.
@patch('scrapyrt.resources.settings', deepcopy(settings))
@patch('scrapyrt.resources.app_settings', deepcopy(app_settings))
def test_realtimeapi_with_custom_settings(self):
from scrapyrt.resources import settings
settings.RESOURCES[b'test.json'] = self._get_class_path('SampleResource')
from scrapyrt.resources import app_settings
app_settings.RESOURCES[b'test.json'] = self._get_class_path(
'SampleResource'
)
expected_entities = {
b'crawl.json': CrawlResource,
b'test.json': SampleResource
