Faster Scraping
I'm rewriting most of the script to make it look better. Half the time, I don't even know what I'm looking for.

Anyway, the script should be up to 100% faster since we'll only call the API once instead of three times, then sort the single response into image, video, and audio files locally; a rough sketch of the idea follows.
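A minimal sketch of that idea, assuming a requests-style session; fetch_all_media, split_by_type, and the endpoint argument are made up for illustration and are not the actual module code. The point is one request, then a local split by media type instead of one filtered request per type.

from itertools import groupby

def fetch_all_media(session, api_url):
    # Hypothetical single request that returns every media item at once,
    # replacing the separate image/video/audio API calls.
    r = session.get(api_url, timeout=30)
    r.raise_for_status()
    return r.json()

def split_by_type(media_items):
    # Sort, then group locally with itertools.groupby (the same pattern the
    # new format_media_set uses), so no extra API round-trips are needed.
    media_items = sorted(media_items, key=lambda m: m["type"])
    return {media_type: list(items)
            for media_type, items in groupby(media_items, key=lambda m: m["type"])}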
SecretShell committed Sep 11, 2020
1 parent 4dbd8df commit ee7cf13
Showing 4 changed files with 285 additions and 277 deletions.
16 changes: 8 additions & 8 deletions datascraper/main_datascraper.py
@@ -7,7 +7,6 @@
from argparse import ArgumentParser

import helpers.main_helper as main_helper
from helpers.main_helper import update_config
import modules.bbwchan as bbwchan
import modules.fourchan as fourchan
import modules.onlyfans as onlyfans
@@ -101,7 +100,8 @@ def start_datascraper():
'user_agent'] else json_auth['user_agent']

x = onlyfans
x.assign_vars(json_auth,json_config, json_site_settings, site_name)
x.assign_vars(json_auth, json_config,
json_site_settings, site_name)
sessions = x.create_session()
if not sessions:
print("Unable to create session")
@@ -121,7 +121,7 @@ def start_datascraper():
json_auth['sess'] = cookies["sess"]
json_auth['fp'] = cookies["fp"]
if json_config != json_config2:
update_config(json_config)
main_helper.update_config(json_config)
me_api = session["me_api"]
array = x.get_subscriptions(
session["sessions"][0], session["subscriber_count"], me_api, auth_count)
@@ -149,7 +149,7 @@ def start_datascraper():
cookies = session["session"].cookies.get_dict()
json_auth['session_id'] = cookies["session_id"]
if json_config != json_config2:
update_config(json_config)
main_helper.update_config(json_config)
me_api = session["me_api"]
array = x.get_subscriptions(
session["session"], auth_count)
@@ -215,7 +215,7 @@ def start_datascraper():
else:
print("There's nothing to scrape.")
continue
start_time = timeit.default_timer()
archive_time = timeit.default_timer()
download_list = []
app_token = ""
for name in names:
@@ -234,14 +234,14 @@ def start_datascraper():
main_helper.assign_vars(json_config)
username = main_helper.parse_links(site_name_lower, name)
result = x.start_datascraper(
session, username, site_name, app_token,choice_type=value)
session, username, site_name, app_token, choice_type=value)
if not args.metadata:
download_list.append(result)
for y in download_list:
for arg in y[1]:
x.download_media(*arg)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
stop_time = str(int(timeit.default_timer() - archive_time) / 60)[:4]
print('Archive Completed in ' + stop_time + ' Minutes')
if exit_on_completion:
print("Now exiting.")
exit(0)
2 changes: 1 addition & 1 deletion extras/OFRenamer/start.py
@@ -84,7 +84,7 @@ def update(filepath):

def start(metadata_filepath, json_settings):
if os.path.getsize(metadata_filepath) > 0:
metadatas = json.load(open(metadata_filepath))
metadatas = json.load(open(metadata_filepath, encoding='utf-8'))
metadatas2 = prepare_metadata(metadatas).items
username = os.path.basename(up(up(metadata_filepath)))
site_name = os.path.basename(up(up(up(metadata_filepath))))
45 changes: 33 additions & 12 deletions helpers/main_helper.py
@@ -7,7 +7,7 @@
import platform
import re
from datetime import datetime
from itertools import chain, zip_longest
from itertools import chain, zip_longest, groupby
from os.path import dirname as up
from urllib.parse import urlparse
import time
@@ -80,15 +80,31 @@ def clean_text(string, remove_spaces=False):
return string


def format_media_set(location, media_set):
x = {}
x["type"] = location
x["valid"] = []
x["invalid"] = []
for y in media_set:
x["valid"].extend(y[0])
x["invalid"].extend(y[1])
return x
def format_media_set(media_set):
media_set = list(chain(*media_set))
media_set.sort(key=lambda x: x["type"])
media_set = [list(g) for k, g in groupby(
media_set, key=lambda x: x["type"])]
new_list = []
for item in media_set:
item2 = {k: [d[k] for d in item] for k in item[0]}
item2["type"] = item2["type"][0]
item2["valid"] = list(chain(*item2["valid"]))
item2["invalid"] = list(chain(*item2["invalid"]))
if item2["valid"]:
seen = set()
item2["valid"] = [x for x in item2["valid"]
if x["filename"] not in seen and not seen.add(x["filename"])]
seen = set()
location_directories = [x["directory"] for x in item2["valid"]
if x["directory"] not in seen and not seen.add(x["directory"])]
for location_directory in location_directories:
os.makedirs(location_directory, exist_ok=True)
item2["valid"] = [list(g) for k, g in groupby(
item2["valid"], key=lambda x: x["post_id"])]
new_list.append(item2)
print
return new_list


def format_image(directory, timestamp):
@@ -250,6 +266,7 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
session = session_rules(session, link)
count = 0
sleep_number = random.randint(2, 5)
result = {}
while count < 11:
try:
count += 1
@@ -260,7 +277,8 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
r = session.request(method, link, json=data,
stream=stream, timeout=timeout)
else:
r = session.request(method, link, stream=stream, timeout=timeout)
r = session.request(
method, link, stream=stream, timeout=timeout)
rule = session_retry_rules(r, link)
if rule == 1:
continue
@@ -275,7 +293,7 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
if not text:
message = "ERROR: 100 Posts skipped. Please post the username you're trying to scrape on the issue "'100 Posts Skipped'""
log_error.exception(message)
return
return result
return json.loads(text)
else:
return r
@@ -288,6 +306,7 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
except Exception as e:
log_error.exception(e)
continue
return result


def get_config(config_path):
@@ -407,8 +426,10 @@ def assign_session(medias, number):
count = 0
return medias2


def create_link_group(max_threads):
x = range
print


log_error = setup_logger('errors', 'errors.log')
