Faster Scraping
I'm rewriting most of the script to make it look better. Half the time, I don't even know what I'm looking for.

Anyway, the script should be up to 100% faster since we'll only call the API once instead of three times, then sort the single response into image, video, and audio files locally; a rough sketch of the idea follows.
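A minimal sketch of that idea, assuming a requests-style session; fetch_all_media, split_by_type, and the endpoint argument are made up for illustration and are not the actual module code. The point is one request, then a local split by media type instead of one filtered request per type.

from itertools import groupby

def fetch_all_media(session, api_url):
    # Hypothetical single request that returns every media item at once,
    # replacing the separate image/video/audio API calls.
    r = session.get(api_url, timeout=30)
    r.raise_for_status()
    return r.json()

def split_by_type(media_items):
    # Sort, then group locally with itertools.groupby (the same pattern the
    # new format_media_set uses), so no extra API round-trips are needed.
    media_items = sorted(media_items, key=lambda m: m["type"])
    return {media_type: list(items)
            for media_type, items in groupby(media_items, key=lambda m: m["type"])}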
SecretShell committed Sep 11, 2020
1 parent 4dbd8df commit ee7cf13
Showing 4 changed files with 285 additions and 277 deletions.
16 changes: 8 additions & 8 deletions datascraper/main_datascraper.py
@@ -7,7 +7,6 @@
from argparse import ArgumentParser

import helpers.main_helper as main_helper
from helpers.main_helper import update_config
import modules.bbwchan as bbwchan
import modules.fourchan as fourchan
import modules.onlyfans as onlyfans
@@ -101,7 +100,8 @@ def start_datascraper():
'user_agent'] else json_auth['user_agent']

x = onlyfans
x.assign_vars(json_auth,json_config, json_site_settings, site_name)
x.assign_vars(json_auth, json_config,
json_site_settings, site_name)
sessions = x.create_session()
if not sessions:
print("Unable to create session")
@@ -121,7 +121,7 @@ def start_datascraper():
json_auth['sess'] = cookies["sess"]
json_auth['fp'] = cookies["fp"]
if json_config != json_config2:
update_config(json_config)
main_helper.update_config(json_config)
me_api = session["me_api"]
array = x.get_subscriptions(
session["sessions"][0], session["subscriber_count"], me_api, auth_count)
@@ -149,7 +149,7 @@ def start_datascraper():
cookies = session["session"].cookies.get_dict()
json_auth['session_id'] = cookies["session_id"]
if json_config != json_config2:
update_config(json_config)
main_helper.update_config(json_config)
me_api = session["me_api"]
array = x.get_subscriptions(
session["session"], auth_count)
@@ -215,7 +215,7 @@ def start_datascraper():
else:
print("There's nothing to scrape.")
continue
start_time = timeit.default_timer()
archive_time = timeit.default_timer()
download_list = []
app_token = ""
for name in names:
@@ -234,14 +234,14 @@ def start_datascraper():
main_helper.assign_vars(json_config)
username = main_helper.parse_links(site_name_lower, name)
result = x.start_datascraper(
session, username, site_name, app_token,choice_type=value)
session, username, site_name, app_token, choice_type=value)
if not args.metadata:
download_list.append(result)
for y in download_list:
for arg in y[1]:
x.download_media(*arg)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
stop_time = str(int(timeit.default_timer() - archive_time) / 60)[:4]
print('Archive Completed in ' + stop_time + ' Minutes')
if exit_on_completion:
print("Now exiting.")
exit(0)
2 changes: 1 addition & 1 deletion extras/OFRenamer/start.py
@@ -84,7 +84,7 @@ def update(filepath):

def start(metadata_filepath, json_settings):
if os.path.getsize(metadata_filepath) > 0:
metadatas = json.load(open(metadata_filepath))
metadatas = json.load(open(metadata_filepath, encoding='utf-8'))
metadatas2 = prepare_metadata(metadatas).items
username = os.path.basename(up(up(metadata_filepath)))
site_name = os.path.basename(up(up(up(metadata_filepath))))
45 changes: 33 additions & 12 deletions helpers/main_helper.py
@@ -7,7 +7,7 @@
import platform
import re
from datetime import datetime
from itertools import chain, zip_longest
from itertools import chain, zip_longest, groupby
from os.path import dirname as up
from urllib.parse import urlparse
import time
@@ -80,15 +80,31 @@ def clean_text(string, remove_spaces=False):
return string


def format_media_set(location, media_set):
x = {}
x["type"] = location
x["valid"] = []
x["invalid"] = []
for y in media_set:
x["valid"].extend(y[0])
x["invalid"].extend(y[1])
return x
def format_media_set(media_set):
media_set = list(chain(*media_set))
media_set.sort(key=lambda x: x["type"])
media_set = [list(g) for k, g in groupby(
media_set, key=lambda x: x["type"])]
new_list = []
for item in media_set:
item2 = {k: [d[k] for d in item] for k in item[0]}
item2["type"] = item2["type"][0]
item2["valid"] = list(chain(*item2["valid"]))
item2["invalid"] = list(chain(*item2["invalid"]))
if item2["valid"]:
seen = set()
item2["valid"] = [x for x in item2["valid"]
if x["filename"] not in seen and not seen.add(x["filename"])]
seen = set()
location_directories = [x["directory"] for x in item2["valid"]
if x["directory"] not in seen and not seen.add(x["directory"])]
for location_directory in location_directories:
os.makedirs(location_directory, exist_ok=True)
item2["valid"] = [list(g) for k, g in groupby(
item2["valid"], key=lambda x: x["post_id"])]
new_list.append(item2)
print
return new_list


def format_image(directory, timestamp):
@@ -250,6 +266,7 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
session = session_rules(session, link)
count = 0
sleep_number = random.randint(2, 5)
result = {}
while count < 11:
try:
count += 1
@@ -260,7 +277,8 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
r = session.request(method, link, json=data,
stream=stream, timeout=timeout)
else:
r = session.request(method, link, stream=stream, timeout=timeout)
r = session.request(
method, link, stream=stream, timeout=timeout)
rule = session_retry_rules(r, link)
if rule == 1:
continue
@@ -275,7 +293,7 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
if not text:
message = "ERROR: 100 Posts skipped. Please post the username you're trying to scrape on the issue "'100 Posts Skipped'""
log_error.exception(message)
return
return result
return json.loads(text)
else:
return r
@@ -288,6 +306,7 @@ def json_request(session, link, method="GET", stream=False, json_format=True, da
except Exception as e:
log_error.exception(e)
continue
return result


def get_config(config_path):
@@ -407,8 +426,10 @@ def assign_session(medias, number):
count = 0
return medias2


def create_link_group(max_threads):
x = range
print


log_error = setup_logger('errors', 'errors.log')
