Merge pull request #15 from blacknon/develop
Develop
blacknon authored Jan 21, 2023
2 parents 0fe6f2a + fcce736 commit 876c751
Showing 5 changed files with 94 additions and 19 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -105,3 +105,6 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# Selenium
+geckodriver.log
6 changes: 5 additions & 1 deletion pydork/engine.py
@@ -441,7 +441,11 @@ def search(self, keyword: str, type='text', maximum=100):
                 )
 
                 # break out of the loop
-                break
+                if self.ENGINE.NAME == "Google":
+                    if self.ENGINE.SEARCH_NEXT_URL is None:
+                        break
+                else:
+                    break
 
             # if the count exceeds the number given by maximum, add results up to that count and break out of the loop
             elif len(links) > maximum - total:
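The effect of this change: for the Google engine the loop keeps requesting result pages for as long as a next-page URL was scraped, while every other engine stops after the first pass as before. A minimal sketch of the new exit condition (simplified; `engine` is a stand-in for `self.ENGINE` in the real code):

    # Sketch of the branch added above, lifted out of its surrounding loop.
    def should_stop(engine) -> bool:
        # Google paginates by following the scraped next-page URL, so it only
        # stops once get_nextpage_url() found none (SEARCH_NEXT_URL is None).
        if engine.NAME == "Google":
            return engine.SEARCH_NEXT_URL is None
        # every other engine keeps the old behavior and stops immediately
        return True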
41 changes: 37 additions & 4 deletions pydork/engine_common.py
@@ -298,12 +298,12 @@ def create_selenium_driver(self):
         # get the options
         options = self.create_selenium_options()
 
-        # add the proxy
-        if self.PROXY != '':
-            options.add_argument('--proxy-server=%s' % self.PROXY)
-
         # create the driver for each browser
         if self.SELENIUM_BROWSER == 'chrome':
+            # add the proxy
+            if self.PROXY != '':
+                options.add_argument('--proxy-server=%s' % self.PROXY)
+
             try:
                 chromedriver_autoinstaller.install()
             except Exception:
@@ -318,6 +318,39 @@
             profile.set_preference('plain_text.wrap_long_lines', False)
             profile.set_preference('view_source.wrap_long_lines', False)
 
+            # add the proxy
+            if self.PROXY != '':
+                # parse self.PROXY
+                parsed_uri = parse.urlparse(self.PROXY)
+
+                # socks5
+                if parsed_uri.scheme == "socks5":
+                    # add the proxy settings
+                    profile.set_preference(
+                        'network.proxy.type', 1)
+                    profile.set_preference('network.proxy.socks_version', 5)
+                    profile.set_preference(
+                        'network.proxy.socks', parsed_uri.hostname)
+                    profile.set_preference(
+                        'network.proxy.socks_port', parsed_uri.port)
+                    profile.set_preference('network.proxy.no_proxies_on', '')
+                    profile.set_preference(
+                        'network.proxy.socks_remote_dns', True)
+                    profile.update_preferences()
+                elif parsed_uri.scheme == "socks4":
+                    # add the proxy settings
+                    profile.set_preference(
+                        'network.proxy.type', 1)
+                    profile.set_preference('network.proxy.socks_version', 4)
+                    profile.set_preference(
+                        'network.proxy.socks', parsed_uri.hostname)
+                    profile.set_preference(
+                        'network.proxy.socks_port', parsed_uri.port)
+                    profile.set_preference('network.proxy.no_proxies_on', '')
+                    profile.set_preference(
+                        'network.proxy.socks_remote_dns', True)
+                    profile.update_preferences()
+
             # set ssl verify (for firefox this is handled in the profile, so it is done here)
             if not self.IGNORE_SSL_VERIFY:
                 profile.accept_untrusted_certs = True
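For reference, a small standalone sketch of how `parse.urlparse` splits a SOCKS proxy URI into the pieces the preferences above consume (the proxy URL here is a made-up example). In Firefox, `network.proxy.type` = 1 selects manual proxy configuration, and `network.proxy.socks_remote_dns` = True routes DNS lookups through the proxy as well:

    from urllib import parse

    proxy = 'socks5://127.0.0.1:9050'  # hypothetical proxy URL

    parsed_uri = parse.urlparse(proxy)
    print(parsed_uri.scheme)    # 'socks5' -> picks socks_version 5
    print(parsed_uri.hostname)  # '127.0.0.1' -> network.proxy.socks
    print(parsed_uri.port)      # 9050 (an int) -> network.proxy.socks_port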
59 changes: 47 additions & 12 deletions pydork/engine_google.py
@@ -15,6 +15,7 @@
 from json.decoder import JSONDecodeError
 from urllib import parse
 from lxml import etree
+from bs4 import BeautifulSoup
 
 from .common import Color
 from .recaptcha import TwoCaptcha
@@ -45,6 +46,9 @@ def __init__(self):
         self.IMAGE_URL = 'https://www.google.com/_/VisualFrontendUi/data/batchexecute'
         self.SUGGEST_URL = 'http://www.google.com/complete/search'
 
+        # URL of the next search results page (obtained in `self.get_nextpage_url`)
+        self.SEARCH_NEXT_URL = None
+
         # used to tell whether we are on a ReCaptcha page
         self.SOUP_RECAPTCHA_TAG = '#captcha-form > #recaptcha'
 
@@ -71,7 +75,7 @@ def gen_search_url(self, keyword: str, type: str):
             url_param = {
                 'q': keyword,  # search keyword
                 'oq': keyword,  # search keyword
-                'num': '100',  # number of results shown per page
+                'num': '100',  # number of results shown per page.
                 'filter': '0',  # filter out similar pages (0...disabled, 1...enabled)
                 'start': '',  # start position
                 'tbs': '',  # time period
@@ -100,14 +104,19 @@ def gen_search_url(self, keyword: str, type: str):
 
             page = 0
             while True:
-                # set the start position of the page in the parameters
-                url_param['start'] = str(page * 100)
-                params = parse.urlencode(url_param)
+                if page == 0:
+                    # set the start position of the page in the parameters
+                    url_param['start'] = str(page * 100)
+                    params = parse.urlencode(url_param)
 
-                target_url = search_url + '?' + params
+                    target_url = search_url + '?' + params
 
-                yield 'GET', target_url, None
+                else:
+                    target_url = self.SEARCH_NEXT_URL
+                    if self.SEARCH_NEXT_URL is None:
+                        break
 
+                yield 'GET', target_url, None
                 page += 1
 
         elif type == 'image':
@@ -192,18 +201,24 @@ def get_links(self, html: str, type: str):
             self.SOUP_SELECT_URL = '#main > div > div > .kCrYT > a'
             self.SOUP_SELECT_TITLE = '#main > div > div > .kCrYT > a > h3 > div'
             self.SOUP_SELECT_TEXT = '#main > div > div > .kCrYT > div > div > div > div > div'
+            self.SOUP_SELECT_NEXT_URL = ''
 
             # when going through Selenium (and using Firefox)
-            if self.USE_SELENIUM and self.SELENIUM_BROWSER == 'firefox':
-                self.SOUP_SELECT_URL = '.jtfYYd > div > .yuRUbf > a'
-                self.SOUP_SELECT_TITLE = '.jtfYYd > div > .yuRUbf > a > .LC20lb'
-                self.SOUP_SELECT_TEXT = '.jtfYYd > div > div'
+            if self.USE_SELENIUM:
+                self.SOUP_SELECT_URL = '.yuRUbf > a'
+                self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
+                self.SOUP_SELECT_TEXT = '.WZ8Tjf'
+                self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
 
             # when communicating via Splash
             elif self.USE_SPLASH:
                 self.SOUP_SELECT_URL = '.yuRUbf > a'
                 self.SOUP_SELECT_TITLE = '.yuRUbf > a > .LC20lb'
-                self.SOUP_SELECT_TEXT = '.jtfYYd > div > div'
+                self.SOUP_SELECT_TEXT = '.WZ8Tjf'
+                self.SOUP_SELECT_NEXT_URL = '.d6cvqb > a'
 
+        # TODO: rewrite SEARCH_NEXT_URL
+        self.get_nextpage_url(html)
+
         # call CommonEngine's handling
         links = super().get_links(html, type)
@@ -214,7 +229,6 @@ def get_links(self, html: str, type: str):
 
         return links
 
-    # function that generates the search results (links (list())) of the image search page
     def get_image_links(self, html: str):
         """get_image_links
@@ -290,6 +304,27 @@ def get_suggest_list(self, suggests: list, char: str, html: str):
 
         return suggests
 
+    def get_nextpage_url(self, html: str):
+        # run the parse with BeautifulSoup
+        soup = BeautifulSoup(html, 'lxml')
+
+        # look for the next url element with BeautifulSoup
+        elements = soup.select(self.SOUP_SELECT_NEXT_URL)
+
+        # extract the next url
+        elinks = [e['href'] for e in elements]
+
+        if len(elinks) == 0:
+            self.SEARCH_NEXT_URL = None
+
+        elif len(elinks) == 1:
+            next_url = parse.urljoin(self.ENGINE_TOP_URL, elinks[0])
+            self.SEARCH_NEXT_URL = next_url
+
+        elif len(elinks) > 1:
+            next_url = parse.urljoin(self.ENGINE_TOP_URL, elinks[1])
+            self.SEARCH_NEXT_URL = next_url
+
     def processings_elist(self, elinks, etitles, etexts: list):
         """processings_elist
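Taken together, pagination now works as a chain: `get_links` calls `get_nextpage_url` on every result page, which stores the scraped link in `SEARCH_NEXT_URL`, and `gen_search_url` yields that URL for the next request instead of building one from a `start` offset. Below is a minimal standalone sketch of the scraping step; the HTML fragment is hypothetical and only shaped to match the `.d6cvqb > a` selector, and a literal URL stands in for `self.ENGINE_TOP_URL`:

    from urllib import parse
    from bs4 import BeautifulSoup

    # hypothetical result-page fragment; real Google markup is more involved
    html = '''
    <div class="d6cvqb"><a href="/search?q=test&start=0">Previous</a></div>
    <div class="d6cvqb"><a href="/search?q=test&start=100">Next</a></div>
    '''

    soup = BeautifulSoup(html, 'lxml')
    elinks = [e['href'] for e in soup.select('.d6cvqb > a')]

    # with more than one match, get_nextpage_url() takes elinks[1]; presumably
    # the first link is the previous page and the second one the next page
    next_url = parse.urljoin('https://www.google.com', elinks[1])
    print(next_url)  # https://www.google.com/search?q=test&start=100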
4 changes: 2 additions & 2 deletions setup.py
@@ -79,8 +79,8 @@ def get_completefile_install_location(shell):
 
 
 name = 'pydork'
-version = '1.1.2'
-release = '1.1.2'
+version = '1.1.3'
+release = '1.1.3'
 
 if __name__ == "__main__":
     setuptools.setup(