Repos / pytaku / 8e5f14292e
commit 8e5f14292e1fb641f81828f67c532519cc5d53b5
Author: Bùi Thành Nhân <hi@imnhan.com>
Date: Sun May 24 23:08:45 2020 +0700
proxied GET requests
diff --git a/src/pytaku/conf.py b/src/pytaku/conf.py
index 6db70cc..ebe84b7 100644
--- a/src/pytaku/conf.py
+++ b/src/pytaku/conf.py
@@ -17,7 +17,23 @@ class Config(GoodConf):
AWS_SECRET_ACCESS_KEY = Value()
AWS_STORAGE_BUCKET_NAME = Value(default="pytaku")
AWS_DEFAULT_ACL = Value(default="public-read")
- AWS_S3_ENDPOINT_URL = Value(default="")
+ AWS_S3_ENDPOINT_URL = Value()
+
+ # Makeshift HTTPS "proxy" running on Google Cloud Functions:
+ GCF_PROXY_PROJECT_NAME = Value()
+ GCF_PROXY_FUNCTION_NAME = Value()
+ GCF_PROXY_PASSWORD = Value()
+ GCF_PROXY_REGIONS = Value(
+ default=[
+ "asia-east2",
+ "asia-northeast1",
+ "europe-west1",
+ "europe-west2",
+ "us-central1",
+ "us-east1",
+ "us-east4",
+ ]
+ )
config = Config(default_files=["pytaku.conf.json"])
diff --git a/src/pytaku/settings.py b/src/pytaku/settings.py
index 3795e7d..3d3a2fc 100644
--- a/src/pytaku/settings.py
+++ b/src/pytaku/settings.py
@@ -131,3 +131,9 @@
AWS_STORAGE_BUCKET_NAME = config.AWS_STORAGE_BUCKET_NAME
AWS_S3_ENDPOINT_URL = config.AWS_S3_ENDPOINT_URL or None
AWS_DEFAULT_ACL = None # use bucket's default policy
+
+
+GCF_PROXY_PROJECT_NAME = config.GCF_PROXY_PROJECT_NAME
+GCF_PROXY_FUNCTION_NAME = config.GCF_PROXY_FUNCTION_NAME
+GCF_PROXY_PASSWORD = config.GCF_PROXY_PASSWORD
+GCF_PROXY_REGIONS = config.GCF_PROXY_REGIONS
diff --git a/src/pytaku_scraper/commands.py b/src/pytaku_scraper/commands.py
index b992209..3b4d0fa 100644
--- a/src/pytaku_scraper/commands.py
+++ b/src/pytaku_scraper/commands.py
@@ -1,7 +1,7 @@
-import requests
from django.db import transaction
from django.utils import timezone
+from .httpclient import HttpClient
from .models import DownloadResult, TaskQueue
from .sites.mangadex import get_latest_id
@@ -20,14 +20,15 @@ def put_download_tasks():
print(f'Successfully put {len(result)} "download" tasks.')
-def download_worker():
+def download_worker(proxy_index):
+ http = HttpClient(proxy_index)
while True:
with transaction.atomic():
task = TaskQueue.pop("download")
task_id = task.id
print(f"Processing task {task_id}: {task.payload}")
- resp = requests.get(task.payload["url"], timeout=30)
- assert resp.status_code in (200, 404), f"Unexpected error: {resp.text}"
+ resp = http.proxied_get(task.payload["url"])
+ assert resp.status_code in (200, 404), f"Unexpected DL error: {resp.text}"
DownloadResult.objects.update_or_create(
url=task.payload["url"],
diff --git a/src/pytaku_scraper/httpclient.py b/src/pytaku_scraper/httpclient.py
new file mode 100644
index 0000000..bf9f2d7
--- /dev/null
+++ b/src/pytaku_scraper/httpclient.py
@@ -0,0 +1,30 @@
+import requests
+from django.conf import settings
+
+# Browser-like User-Agent; passed as the "headers" field of the JSON payload
+# that HttpClient.proxied_get posts to the proxy.
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/74.0.3729.169 Safari/537.36"
+    ),
+}
+
+
+class HttpClient:
+    """HTTP client that routes requests through a Google Cloud Functions proxy.
+
+    Each instance is bound to one proxy endpoint, selected by indexing
+    ``settings.GCF_PROXY_REGIONS`` with ``proxy_index``.
+    """
+
+    def __init__(self, proxy_index=0):
+        """Build the proxy URL for the region at ``proxy_index``.
+
+        Args:
+            proxy_index: Index into ``settings.GCF_PROXY_REGIONS``. Negative
+                values follow normal Python indexing.
+
+        Raises:
+            IndexError: If ``proxy_index`` is outside the configured regions.
+        """
+        regions = settings.GCF_PROXY_REGIONS
+        try:
+            proxy_region = regions[proxy_index]
+        except IndexError:
+            # The index typically comes from the download_worker command line,
+            # so replace the bare "list index out of range" with a message
+            # that says what was asked for and how many regions exist.
+            raise IndexError(
+                "proxy_index {} is out of range; {} proxy region(s) are "
+                "configured in GCF_PROXY_REGIONS".format(proxy_index, len(regions))
+            ) from None
+        self.proxy_url = "https://{}-{}.cloudfunctions.net/{}".format(
+            proxy_region,
+            settings.GCF_PROXY_PROJECT_NAME,
+            settings.GCF_PROXY_FUNCTION_NAME,
+        )
+        print("HttpClient proxy region:", proxy_region)
+
+    def proxied_get(self, url, timeout=10):
+        """POST a JSON description of a GET request for ``url`` to the proxy.
+
+        The payload carries the target URL, the method ("get"), an empty body,
+        the module-level ``HEADERS`` and the configured proxy password.
+
+        Args:
+            url: Target URL the proxy is asked to fetch.
+            timeout: Timeout in seconds for the POST to the proxy itself.
+
+        Returns:
+            The ``requests.Response`` of the POST to the proxy.
+            NOTE(review): presumably the proxy relays the target's status and
+            body; the proxy function's code is not visible here, so confirm.
+        """
+        return requests.post(
+            self.proxy_url,
+            json={
+                "url": url,
+                "method": "get",
+                "body": None,
+                "headers": HEADERS,
+                "password": settings.GCF_PROXY_PASSWORD,
+            },
+            timeout=timeout,
+        )
diff --git a/src/pytaku_scraper/management/commands/download_worker.py b/src/pytaku_scraper/management/commands/download_worker.py
index 87f3f75..bf357b0 100644
--- a/src/pytaku_scraper/management/commands/download_worker.py
+++ b/src/pytaku_scraper/management/commands/download_worker.py
@@ -6,5 +6,8 @@
class Command(BaseCommand):
help = "Download worker. Run as many as needed."
+ def add_arguments(self, parser):
+ parser.add_argument("proxy_index", type=int)
+
def handle(self, *args, **options):
- download_worker()
+ download_worker(options["proxy_index"])