commit 02273ce90758a1c1f4d160aa2892e60c4105bb61
Author: Bùi Thành Nhân <hi@imnhan.com>
Date:   Sat Aug 29 13:46:22 2020 +0700

    cache proxied images to filesystem, prune daily

diff --git a/.gitignore b/.gitignore
index 5a972bb..6d9a6c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+/proxy_cache
 /debug.py
 /test_secrets.json
 __pycache__
diff --git a/pyproject.toml b/pyproject.toml
index dc48d86..7b01052 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytaku"
-version = "0.3.7"
+version = "0.3.8"
 description = "Self-hostable web-based manga reader"
 authors = ["Bùi Thành Nhân <hi@imnhan.com>"]
 license = "AGPL-3.0-only"
diff --git a/src/pytaku/conf.py b/src/pytaku/conf.py
index 6d1ecb1..fd4e2fd 100644
--- a/src/pytaku/conf.py
+++ b/src/pytaku/conf.py
@@ -7,6 +7,9 @@ class Config(GoodConf):
     MANGADEX_USERNAME = Value()
     MANGADEX_PASSWORD = Value()
     FLASK_SECRET_KEY = Value(initial=lambda: token_urlsafe(50))
+    PROXY_CACHE_DIR = Value(default="proxy_cache")
+    PROXY_CACHE_MAX_SIZE = Value(default=1024 * 1024 * 1024)  # 1GiB in bytes
+    PROXY_CACHE_MAX_AGE = Value(default=3600 * 24 * 7)  # 7 days in seconds
 
 
 config = Config(default_files=["pytaku.conf.json"])
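
Note: the three settings added above are plain GoodConf Values, so each can be
overridden per deployment via pytaku.conf.json (the file named in
default_files) or left at its default. A minimal sketch of how the defaults
resolve, assuming no overriding config file is present:

    from pytaku.conf import config

    config.load()  # reads pytaku.conf.json if present, else keeps defaults

    assert config.PROXY_CACHE_DIR == "proxy_cache"
    assert config.PROXY_CACHE_MAX_SIZE == 1024 * 1024 * 1024  # 1 GiB
    assert config.PROXY_CACHE_MAX_AGE == 3600 * 24 * 7        # 7 days
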
diff --git a/src/pytaku/main.py b/src/pytaku/main.py
index 2574598..9626226 100644
--- a/src/pytaku/main.py
+++ b/src/pytaku/main.py
@@ -3,6 +3,7 @@
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import timedelta
+from pathlib import Path
 from typing import List, Tuple
 
 import requests
@@ -36,6 +37,7 @@
     title_source_url,
     title_thumbnail,
 )
+from .storages import storage
 
 config.load()
 
@@ -61,19 +63,6 @@ def _chapter_name(chapter: dict):
     return result
 
 
-@app.route("/proxy/<b64_url>")
-def proxy_view(b64_url):
-    """Fine I'll do it"""
-    url = _decode_proxy_url(b64_url)
-    if not _is_manga_img_url(url):
-        print("Invalid img url:", url)
-        return "Nope", 400
-    md_resp = requests.get(url)
-    resp = make_response(md_resp.content, md_resp.status_code)
-    resp.headers["Content-Type"] = md_resp.headers["Content-Type"]
-    return resp
-
-
 def _encode_proxy_url(url):
     return base64.urlsafe_b64encode(url.encode()).decode()
 
@@ -91,6 +80,40 @@ def _is_manga_img_url(
     return pattern.match(url)
 
 
+@app.route("/proxy/<b64_url>")
+def proxy_view(b64_url):
+    """
+    Cached proxy for images (manga cover/page). Motivations:
+        - get around source site's hotlinking protection
+        - keep working even when source site is down
+        - be a polite netizen in general
+    """
+    url = _decode_proxy_url(b64_url)
+    if not _is_manga_img_url(url):
+        print("Invalid img url:", url)
+        return "Nope", 400
+
+    cached_file_path = Path(config.PROXY_CACHE_DIR) / b64_url
+    cached_headers_path = cached_file_path.with_suffix(".headers.json")
+
+    if not (storage.exists(cached_file_path) and storage.exists(cached_headers_path)):
+        md_resp = requests.get(url)
+        status_code = md_resp.status_code
+        body = md_resp.content
+        headers = {"Content-Type": md_resp.headers["content-type"]}
+        if status_code == 200:
+            storage.save(cached_headers_path, json.dumps(headers).encode())
+            storage.save(cached_file_path, md_resp.content)
+    else:
+        status_code = 200
+        body = storage.read(cached_file_path)
+        headers = json.loads(storage.read(cached_headers_path))
+
+    headers["Cache-Control"] = "max-age=31536000"
+
+    return body, status_code, headers
+
+
 def read_tachiyomi_follows(text: str) -> List[Tuple[str, str]]:
     try:
         data = json.loads(text)
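
Note: the cache key for the new proxy_view is just the URL-safe base64 of the
image URL, so both on-disk paths can be derived from the request with no
lookup table. A self-contained sketch of the round trip; example_url is made
up, and _decode_proxy_url's body is assumed (only its call sites appear in
this diff):

    import base64
    from pathlib import Path

    def _encode_proxy_url(url):
        return base64.urlsafe_b64encode(url.encode()).decode()

    def _decode_proxy_url(b64_url):  # assumed inverse, not shown in the diff
        return base64.urlsafe_b64decode(b64_url).decode()

    example_url = "https://example.com/covers/1.jpg"  # hypothetical
    b64 = _encode_proxy_url(example_url)
    assert _decode_proxy_url(b64) == example_url

    # urlsafe base64 never contains ".", so with_suffix() appends rather
    # than replacing anything, exactly as proxy_view relies on:
    cached_file_path = Path("proxy_cache") / b64
    cached_headers_path = cached_file_path.with_suffix(".headers.json")
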
diff --git a/src/pytaku/scheduler.py b/src/pytaku/scheduler.py
index d6183d2..1617c8d 100644
--- a/src/pytaku/scheduler.py
+++ b/src/pytaku/scheduler.py
@@ -1,7 +1,9 @@
 import time
 from abc import ABC, abstractmethod
 from datetime import datetime, timedelta
+from pathlib import Path
 
+from .conf import config
 from .persistence import delete_expired_tokens, find_outdated_titles, save_title
 from .source_sites import get_title
 
@@ -9,7 +11,7 @@
 
 
 def main_loop():
-    workers = [UpdateOutdatedTitles(), DeleteExpiredTokens()]
+    workers = [UpdateOutdatedTitles(), DeleteExpiredTokens(), PruneProxyCache()]
 
     while True:
         for worker in workers:
@@ -55,3 +57,47 @@ def run(self):
         num_deleted = delete_expired_tokens()
         if num_deleted > 0:
             print("Deleted", num_deleted, "tokens")
+
+
+class PruneProxyCache(Worker):
+    """
+    If proxy cache dir size exceeds config.PROXY_CACHE_MAX_SIZE,
+    delete files that are older than config.PROXY_CACHE_MAX_AGE.
+
+    Only applies for FilesystemStorage.
+    TODO: update this accordingly when a new Storage class is introduced.
+    """
+
+    interval = timedelta(days=1)
+
+    def run(self):
+        cache_dir = Path(config.PROXY_CACHE_DIR)
+        cache_size = get_dir_size(cache_dir)
+
+        if cache_size <= config.PROXY_CACHE_MAX_SIZE:
+            return
+
+        now = time.time()
+        files_deleted = 0
+        bytes_deleted = 0
+        for child in cache_dir.iterdir():
+            if child.is_file():
+                stat = child.stat()
+                modified_at = stat.st_mtime
+                if (now - modified_at) > config.PROXY_CACHE_MAX_AGE:
+                    child.unlink()  # yes this means delete
+                    files_deleted += 1
+                    bytes_deleted += stat.st_size
+
+        if files_deleted > 0:
+            in_mib = bytes_deleted / 1024 / 1024
+            print(f"Deleted {files_deleted} files ({in_mib:.2f} MiB).")
+        else:
+            print("Deleted nothing.")
+
+
+def get_dir_size(path: Path):
+    """
+    In bytes.
+    """
+    return sum(f.stat().st_size for f in path.glob("**/*") if f.is_file())
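
Note the two-step policy above: the size check is global but deletion is
purely age-based, so a cache that exceeds PROXY_CACHE_MAX_SIZE with only
fresh files stays untouched until entries cross the age cutoff. A runnable
sketch of that behavior against a throwaway directory, with tiny stand-in
thresholds instead of the real config values:

    import os
    import tempfile
    import time
    from pathlib import Path

    MAX_SIZE = 10            # bytes; tiny so the size check trips
    MAX_AGE = 3600 * 24 * 7  # seconds; same default as PROXY_CACHE_MAX_AGE

    with tempfile.TemporaryDirectory() as tmp:
        cache_dir = Path(tmp)
        (cache_dir / "fresh").write_bytes(b"x" * 20)
        (cache_dir / "stale").write_bytes(b"x" * 20)
        # backdate "stale" past the age cutoff
        old = time.time() - MAX_AGE - 1
        os.utime(cache_dir / "stale", (old, old))

        size = sum(f.stat().st_size for f in cache_dir.glob("**/*") if f.is_file())
        if size > MAX_SIZE:
            now = time.time()
            for child in cache_dir.iterdir():
                if child.is_file() and (now - child.stat().st_mtime) > MAX_AGE:
                    child.unlink()

        assert (cache_dir / "fresh").exists()
        assert not (cache_dir / "stale").exists()
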
diff --git a/src/pytaku/storages.py b/src/pytaku/storages.py
new file mode 100644
index 0000000..81d7fd2
--- /dev/null
+++ b/src/pytaku/storages.py
@@ -0,0 +1,33 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+
+class Storage(ABC):
+    @abstractmethod
+    def save(self, path: Path, blob: bytes):
+        pass
+
+    @abstractmethod
+    def exists(self, path: Path) -> bool:
+        pass
+
+    @abstractmethod
+    def read(self, path: Path) -> bytes:
+        pass
+
+
+class FilesystemStorage(Storage):
+    def save(self, path: Path, blob: bytes):
+        if not path.parent.is_dir():
+            path.parent.mkdir(parents=True)
+        path.write_bytes(blob)
+
+    def exists(self, path: Path) -> bool:
+        return path.is_file()
+
+    def read(self, path: Path) -> bytes:
+        return path.read_bytes()
+
+
+# TODO: support other storages e.g. s3-like
+storage = FilesystemStorage()
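
Note: for the TODO above, any new backend only has to implement the three
abstract methods. A hypothetical sketch of an s3-like storage, assuming boto3
and the Storage ABC from this file; the bucket name and client wiring are
illustrative and none of this exists in pytaku (the filesystem-only
PruneProxyCache worker would also need its own follow-up):

    from pathlib import Path

    import boto3  # assumption: not a pytaku dependency today
    from botocore.exceptions import ClientError

    class S3Storage(Storage):
        def __init__(self, bucket: str):
            self.bucket = bucket
            self.client = boto3.client("s3")

        def save(self, path: Path, blob: bytes):
            self.client.put_object(Bucket=self.bucket, Key=str(path), Body=blob)

        def exists(self, path: Path) -> bool:
            try:
                self.client.head_object(Bucket=self.bucket, Key=str(path))
                return True
            except ClientError:
                return False

        def read(self, path: Path) -> bytes:
            obj = self.client.get_object(Bucket=self.bucket, Key=str(path))
            return obj["Body"].read()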