commit d01856973a553ecaa826a75057511dba1151f974
Author: Bùi Thành Nhân <hi@imnhan.com>
Date:   Tue May 5 23:22:24 2020 +0700

    wip scraping queue

diff --git a/poetry.lock b/poetry.lock
index 2a61293..d391d54 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -49,6 +49,21 @@ optional = false
 python-versions = "*"
 version = "0.1.0"
 
+[[package]]
+category = "main"
+description = "Screen-scraping library"
+name = "beautifulsoup4"
+optional = false
+python-versions = "*"
+version = "4.9.0"
+
+[package.dependencies]
+soupsieve = [">1.2", "<2.0"]
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 category = "dev"
 description = "The uncompromising code formatter."
@@ -69,6 +84,22 @@ typed-ast = ">=1.4.0"
 [package.extras]
 d = ["aiohttp (>=3.3.2)", "aiohttp-cors"]
 
+[[package]]
+category = "main"
+description = "Python package for providing Mozilla's CA Bundle."
+name = "certifi"
+optional = false
+python-versions = "*"
+version = "2020.4.5.1"
+
+[[package]]
+category = "main"
+description = "Universal encoding detector for Python 2 and 3"
+name = "chardet"
+optional = false
+python-versions = "*"
+version = "3.0.4"
+
 [[package]]
 category = "dev"
 description = "Composable command line interface toolkit"
@@ -147,6 +178,14 @@ maintainer = ["zest.releaser"]
 tests = ["django (<2.1)", "ruamel.yaml", "pytest (3.5.0)", "pytest-cov (2.5.1)", "pytest-mock (1.7.1)"]
 yaml = ["ruamel.yaml"]
 
+[[package]]
+category = "main"
+description = "Internationalized Domain Names in Applications (IDNA)"
+name = "idna"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+version = "2.9"
+
 [[package]]
 category = "dev"
 description = "Read metadata from Python packages"
@@ -248,6 +287,20 @@ parso = ">=0.5.2"
 [package.extras]
 testing = ["colorama (0.4.1)", "docopt", "pytest (>=3.9.0,<5.0.0)"]
 
+[[package]]
+category = "main"
+description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
+name = "lxml"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
+version = "4.5.0"
+
+[package.extras]
+cssselect = ["cssselect (>=0.7)"]
+html5 = ["html5lib"]
+htmlsoup = ["beautifulsoup4"]
+source = ["Cython (>=0.29.7)"]
+
 [[package]]
 category = "dev"
 description = "McCabe checker, plugin for flake8"
@@ -423,6 +476,24 @@ optional = false
 python-versions = "*"
 version = "2020.4.4"
 
+[[package]]
+category = "main"
+description = "Python HTTP for Humans."
+name = "requests"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+version = "2.23.0"
+
+[package.dependencies]
+certifi = ">=2017.4.17"
+chardet = ">=3.0.2,<4"
+idna = ">=2.5,<3"
+urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
+
+[package.extras]
+security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
+socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"]
+
 [[package]]
 category = "dev"
 description = "Python 2 and 3 compatibility utilities"
@@ -432,6 +503,14 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
 version = "1.14.0"
 
+[[package]]
+category = "main"
+description = "A modern CSS selector implementation for Beautiful Soup."
+name = "soupsieve"
+optional = false
+python-versions = "*"
+version = "1.9.5"
+
 [[package]]
 category = "main"
 description = "Non-validating SQL parser"
@@ -482,6 +561,19 @@ optional = false
 python-versions = "*"
 version = "1.35"
 
+[[package]]
+category = "main"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+name = "urllib3"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
+version = "1.25.9"
+
+[package.extras]
+brotli = ["brotlipy (>=0.6.0)"]
+secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=0.14)", "ipaddress"]
+socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"]
+
 [[package]]
 category = "dev"
 description = "Measures number of Terminal column cells of wide-character codes"
@@ -505,7 +597,7 @@ docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"]
 testing = ["jaraco.itertools", "func-timeout"]
 
 [metadata]
-content-hash = "6bda94aa80b72143c515264c16b725f01d1bcac858b642f637dd99deca5cf6c7"
+content-hash = "785d3de004285b89514d1f2fb02101901499db037100a2841959771244936f57"
 python-versions = "^3.7"
 
 [metadata.files]
@@ -529,10 +621,23 @@ backcall = [
     {file = "backcall-0.1.0.tar.gz", hash = "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4"},
     {file = "backcall-0.1.0.zip", hash = "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"},
 ]
+beautifulsoup4 = [
+    {file = "beautifulsoup4-4.9.0-py2-none-any.whl", hash = "sha256:a4bbe77fd30670455c5296242967a123ec28c37e9702a8a81bd2f20a4baf0368"},
+    {file = "beautifulsoup4-4.9.0-py3-none-any.whl", hash = "sha256:d4e96ac9b0c3a6d3f0caae2e4124e6055c5dcafde8e2f831ff194c104f0775a0"},
+    {file = "beautifulsoup4-4.9.0.tar.gz", hash = "sha256:594ca51a10d2b3443cbac41214e12dbb2a1cd57e1a7344659849e2e20ba6a8d8"},
+]
 black = [
     {file = "black-19.10b0-py36-none-any.whl", hash = "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b"},
     {file = "black-19.10b0.tar.gz", hash = "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539"},
 ]
+certifi = [
+    {file = "certifi-2020.4.5.1-py2.py3-none-any.whl", hash = "sha256:1d987a998c75633c40847cc966fcf5904906c920a7f17ef374f5aa4282abd304"},
+    {file = "certifi-2020.4.5.1.tar.gz", hash = "sha256:51fcb31174be6e6664c5f69e3e1691a2d72a1a12e90f872cbdb1567eb47b6519"},
+]
+chardet = [
+    {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
+    {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
+]
 click = [
     {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
     {file = "click-7.1.2.tar.gz", hash = "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a"},
@@ -561,6 +666,10 @@ goodconf = [
     {file = "goodconf-1.0.0-py2.py3-none-any.whl", hash = "sha256:beb2f9ed734015e1becd4338d8b1e363cf51fb52e2f794f4e85e8c59d097442e"},
     {file = "goodconf-1.0.0.tar.gz", hash = "sha256:2c33460b4d9859ffacff32355b7effb1a922a16c1d54e8edd6452503bd8e809b"},
 ]
+idna = [
+    {file = "idna-2.9-py2.py3-none-any.whl", hash = "sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"},
+    {file = "idna-2.9.tar.gz", hash = "sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb"},
+]
 importlib-metadata = [
     {file = "importlib_metadata-1.6.0-py2.py3-none-any.whl", hash = "sha256:2a688cbaa90e0cc587f1df48bdc97a6eadccdcd9c35fb3f976a09e3b5016d90f"},
     {file = "importlib_metadata-1.6.0.tar.gz", hash = "sha256:34513a8a0c4962bc66d35b359558fd8a5e10cd472d37aec5f66858addef32c1e"},
@@ -584,6 +693,35 @@ jedi = [
     {file = "jedi-0.15.2-py2.py3-none-any.whl", hash = "sha256:1349c1e8c107095a55386628bb3b2a79422f3a2cab8381e34ce19909e0cf5064"},
     {file = "jedi-0.15.2.tar.gz", hash = "sha256:e909527104a903606dd63bea6e8e888833f0ef087057829b89a18364a856f807"},
 ]
+lxml = [
+    {file = "lxml-4.5.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c"},
+    {file = "lxml-4.5.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd"},
+    {file = "lxml-4.5.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261"},
+    {file = "lxml-4.5.0-cp27-cp27m-win32.whl", hash = "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89"},
+    {file = "lxml-4.5.0-cp27-cp27m-win_amd64.whl", hash = "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a"},
+    {file = "lxml-4.5.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128"},
+    {file = "lxml-4.5.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca"},
+    {file = "lxml-4.5.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb"},
+    {file = "lxml-4.5.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8"},
+    {file = "lxml-4.5.0-cp35-cp35m-win32.whl", hash = "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77"},
+    {file = "lxml-4.5.0-cp35-cp35m-win_amd64.whl", hash = "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081"},
+    {file = "lxml-4.5.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9"},
+    {file = "lxml-4.5.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717"},
+    {file = "lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15"},
+    {file = "lxml-4.5.0-cp36-cp36m-win32.whl", hash = "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7"},
+    {file = "lxml-4.5.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012"},
+    {file = "lxml-4.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6"},
+    {file = "lxml-4.5.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679"},
+    {file = "lxml-4.5.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc"},
+    {file = "lxml-4.5.0-cp37-cp37m-win32.whl", hash = "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a"},
+    {file = "lxml-4.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8"},
+    {file = "lxml-4.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72"},
+    {file = "lxml-4.5.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1"},
+    {file = "lxml-4.5.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a"},
+    {file = "lxml-4.5.0-cp38-cp38-win32.whl", hash = "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f"},
+    {file = "lxml-4.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3"},
+    {file = "lxml-4.5.0.tar.gz", hash = "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60"},
+]
 mccabe = [
     {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"},
     {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
@@ -678,10 +816,18 @@ regex = [
     {file = "regex-2020.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:5bfed051dbff32fd8945eccca70f5e22b55e4148d2a8a45141a3b053d6455ae3"},
     {file = "regex-2020.4.4.tar.gz", hash = "sha256:295badf61a51add2d428a46b8580309c520d8b26e769868b922750cf3ce67142"},
 ]
+requests = [
+    {file = "requests-2.23.0-py2.py3-none-any.whl", hash = "sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee"},
+    {file = "requests-2.23.0.tar.gz", hash = "sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"},
+]
 six = [
     {file = "six-1.14.0-py2.py3-none-any.whl", hash = "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"},
     {file = "six-1.14.0.tar.gz", hash = "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a"},
 ]
+soupsieve = [
+    {file = "soupsieve-1.9.5-py2.py3-none-any.whl", hash = "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5"},
+    {file = "soupsieve-1.9.5.tar.gz", hash = "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda"},
+]
 sqlparse = [
     {file = "sqlparse-0.3.1-py2.py3-none-any.whl", hash = "sha256:022fb9c87b524d1f7862b3037e541f68597a730a8843245c349fc93e1643dc4e"},
     {file = "sqlparse-0.3.1.tar.gz", hash = "sha256:e162203737712307dfe78860cc56c8da8a852ab2ee33750e33aeadf38d12c548"},
@@ -721,6 +867,10 @@ typed-ast = [
 ujson = [
     {file = "ujson-1.35.tar.gz", hash = "sha256:f66073e5506e91d204ab0c614a148d5aa938bdbf104751be66f8ad7a222f5f86"},
 ]
+urllib3 = [
+    {file = "urllib3-1.25.9-py2.py3-none-any.whl", hash = "sha256:88206b0eb87e6d677d424843ac5209e3fb9d0190d0ee169599165ec25e9d9115"},
+    {file = "urllib3-1.25.9.tar.gz", hash = "sha256:3018294ebefce6572a474f0604c2021e33b3fd8006ecd11d62107a5d2a963527"},
+]
 wcwidth = [
     {file = "wcwidth-0.1.9-py2.py3-none-any.whl", hash = "sha256:cafe2186b3c009a04067022ce1dcd79cb38d8d65ee4f4791b8888d6599d1bbe1"},
     {file = "wcwidth-0.1.9.tar.gz", hash = "sha256:ee73862862a156bf77ff92b09034fc4825dd3af9cf81bc5b360668d425f3c5f1"},
diff --git a/pyproject.toml b/pyproject.toml
index 0411818..fcd37fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,9 @@ python = "^3.7"
 django = "^3.0.5"
 psycopg2 = "^2.8.5"
 goodconf = "^1.0.0"
+requests = "^2.23.0"
+beautifulsoup4 = "^4.9.0"
+lxml = "^4.5.0"
 
 [tool.poetry.dev-dependencies]
 python-language-server = "^0.31.10"
diff --git a/src/pytaku/asgi.py b/src/pytaku/asgi.py
deleted file mode 100644
index 2061535..0000000
--- a/src/pytaku/asgi.py
+++ /dev/null
@@ -1,16 +0,0 @@
-"""
-ASGI config for pytaku project.
-
-It exposes the ASGI callable as a module-level variable named ``application``.
-
-For more information on this file, see
-https://docs.djangoproject.com/en/3.0/howto/deployment/asgi/
-"""
-
-import os
-
-from django.core.asgi import get_asgi_application
-
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "pytaku.settings")
-
-application = get_asgi_application()
diff --git a/src/pytaku/settings.py b/src/pytaku/settings.py
index d0ab210..3795e7d 100644
--- a/src/pytaku/settings.py
+++ b/src/pytaku/settings.py
@@ -42,6 +42,7 @@
     "django.contrib.messages",
     "django.contrib.staticfiles",
     "pytaku_web",
+    "pytaku_scraper",
 ]
 
 MIDDLEWARE = [
diff --git a/src/pytaku_scraper/__init__.py b/src/pytaku_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pytaku_scraper/models.py b/src/pytaku_scraper/models.py
new file mode 100644
index 0000000..8ca2a54
--- /dev/null
+++ b/src/pytaku_scraper/models.py
@@ -0,0 +1,27 @@
+from django.contrib.postgres.fields import JSONField
+from django.db import models
+
+QUEUE_NAMES = [("scrape", "Scrape")]
+
+
+class ScrapeAttempt(models.Model):
+    class Meta:
+        db_table = "scrape_attempt"
+
+    scraped_at = models.DateTimeField(auto_now_add=True)
+
+    url = models.CharField(max_length=1024)
+    method = models.CharField(max_length=7)
+    headers = JSONField(default=dict)
+    body = models.TextField()
+
+    resp_body = models.TextField()
+    resp_status = models.IntegerField()
+
+
+class TaskQueue(models.Model):
+    class Meta:
+        db_table = "task_queue"
+
+    created_at = models.DateTimeField(auto_now_add=True)
+    name = models.CharField(max_length=100, choices=QUEUE_NAMES)
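
Nothing in this commit consumes these models yet. As a purely hypothetical sketch (the worker flow and helper name are assumptions, not part of this change), a queue consumer could persist a fetch through ScrapeAttempt like so, given an already-fetched requests.Response:

    from pytaku_scraper.models import ScrapeAttempt

    def log_attempt(resp):
        # Store the raw request/response pair so parsing can be retried later.
        return ScrapeAttempt.objects.create(
            url=resp.request.url,
            method=resp.request.method,
            headers=dict(resp.request.headers),
            body=resp.request.body or "",
            resp_body=resp.text,
            resp_status=resp.status_code,
        )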
diff --git a/src/pytaku_scraper/sites/__init__.py b/src/pytaku_scraper/sites/__init__.py
new file mode 100644
index 0000000..8f796a6
--- /dev/null
+++ b/src/pytaku_scraper/sites/__init__.py
@@ -0,0 +1,15 @@
+from importlib import import_module
+from urllib.parse import urlparse
+
+available_sites = {"mangadex.org": ".mangadex"}
+
+
+# Return the scraper module that handles this URL's domain, or None if unsupported.
+def get_site(url):
+    netloc = urlparse(url).netloc
+    module_name = available_sites.get(netloc)
+    return (
+        None
+        if module_name is None
+        else import_module(module_name, "pytaku_scraper.sites")
+    )
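
For illustration only (this call site is not part of the commit): resolving a URL to its scraper module and using it would look roughly like this, assuming the helpers defined in mangadex.py below:

    from pytaku_scraper.sites import get_site

    url = "https://mangadex.org/title/123"
    site = get_site(url)  # resolves to pytaku_scraper.sites.mangadex
    if site is not None:
        title = site.scrape_title(site.title_id_from_url(url))
    # URLs on unsupported domains return None and can simply be skipped.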
diff --git a/src/pytaku_scraper/sites/mangadex.py b/src/pytaku_scraper/sites/mangadex.py
new file mode 100644
index 0000000..189c440
--- /dev/null
+++ b/src/pytaku_scraper/sites/mangadex.py
@@ -0,0 +1,181 @@
+import itertools
+import re
+
+import requests
+from attr import attrib, attrs
+from bs4 import BeautifulSoup
+
+DOMAIN = "https://mangadex.org"
+API_URL = "https://mangadex.org/api/"
+
+session = requests.Session()
+session.headers = {
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"
+}
+
+
+@attrs(slots=True, kw_only=True)
+class Title(object):
+    url = attrib(type=str)
+    name = attrib(type=str)
+    alt_names = attrib(type=list)
+    authors = attrib(type=list)
+    tags = attrib(type=list)
+    publication_status = attrib(type=str)
+    descriptions = attrib(type=list)
+    chapters = attrib(type=list)
+
+
+@attrs(slots=True, kw_only=True)
+class Chapter(object):
+    name = attrib(type=str)
+    pages = attrib(type=list)
+
+
+def title_url_from_id(original_id):
+    return f"{DOMAIN}/title/{original_id}"
+
+
+def chapter_url_from_id(original_id):
+    return f"{DOMAIN}/chapter/{original_id}"
+
+
+_chapter_url_regex = re.compile("^" + DOMAIN + r"/chapter/(\d+)/?$")
+
+
+def chapter_id_from_url(url):
+    match = _chapter_url_regex.match(url)
+    return match.group(1) if match else None
+
+
+_title_url_regex = re.compile("^" + DOMAIN + r"/title/(\d+)(/.*)?$")
+
+
+def title_id_from_url(url):
+    match = _title_url_regex.match(url)
+    return match.group(1) if match else None
+
+
+def scrape_title(original_id):
+    source_url = title_url_from_id(original_id)
+    html = session.get(source_url).text
+    soup = BeautifulSoup(html, "lxml")
+
+    url = soup.select('link[rel="canonical"]')[0].attrs["href"]
+    name = soup.select(".card-header span.mx-1")[0].text
+
+    alt_names = _get_next_column_of(soup, "Alt name(s)", "li")
+    authors = _get_next_column_of(soup, "Author", "a")
+    artists = _get_next_column_of(soup, "Artist", "a")
+    genres = _get_next_column_of(soup, "Genre", "a")
+    themes = _get_next_column_of(soup, "Theme", "a")
+    pub_status = _get_next_column_of(soup, "Pub. status")
+
+    raw_descs = _get_next_column_of(soup, "Description")
+    descriptions = [desc.strip() for desc in raw_descs.split("\n\n") if desc.strip()]
+
+    chapters = _get_chapters(soup)
+    return {
+        "url": url,
+        "name": name,
+        "alt_names": alt_names,
+        "authors": sorted(set(authors + artists)),
+        "tags": sorted(set(genres + themes)),
+        "publication_status": pub_status,
+        "descriptions": descriptions,
+        "chapters": chapters,
+    }
+
+
+def _get_next_column_of(soup, query, subtag=None):
+    label = soup.find("div", string=f"{query}:")
+    if label is None:
+        return None if subtag is None else []
+
+    # newlines also count as siblings, so we have to filter them out:
+    siblings = [sibl for sibl in label.next_siblings if sibl.name is not None]
+    if len(siblings) != 1:
+        raise Exception(f'Unexpected siblings found for "{query}": {siblings}')
+    next_column = siblings[0]
+    return _get_column_content(next_column, subtag)
+
+
+def _get_column_content(column, subtag):
+    if subtag is None:
+        return column.text.strip()
+    else:
+        return [
+            child.text.strip()
+            for child in column.find_all(subtag)
+            if child.text.strip()
+        ]
+
+
+def _get_chapters(soup):
+    chapter_page_urls = _chapter_page_urls(soup)
+    chapter_page_soups = [
+        BeautifulSoup(session.get(f"{DOMAIN}{url}").text, "lxml")
+        for url in chapter_page_urls
+    ]
+    chapter_page_soups.insert(0, soup)  # saves us 1 http request :)
+    chapters = [_chapters_data(soup) for soup in chapter_page_soups]
+    return list(itertools.chain(*chapters))  # flatten list of list
+
+
+def _chapter_page_urls(soup):
+    """
+    Excluding first page because we already have it
+    """
+    last_chapter_link_tag = soup.find(title="Jump to last page")
+    if not last_chapter_link_tag:
+        return []
+
+    last_chapter_link = last_chapter_link_tag.parent.attrs["href"]
+    if last_chapter_link[-1] == "/":
+        last_chapter_link = last_chapter_link[:-1]
+
+    parts = last_chapter_link.split("/")
+    max_page = int(parts.pop())
+
+    template = "/".join(parts + ["%d/"])
+
+    return [template % page_num for page_num in range(2, max_page + 1)]
+
+
+def _chapters_data(soup):
+    chapter_container = soup.find(class_="chapter-container")
+
+    def is_chapter_link(href):
+        return href.startswith("/chapter/") and not href.endswith("comments")
+
+    chapters = chapter_container.find_all("a", href=is_chapter_link)
+
+    eng_chapters = [
+        {
+            "id": chapter_id_from_url(f'{DOMAIN}{chapter.attrs["href"]}'),
+            "name": chapter.text,
+        }
+        for chapter in chapters
+        if chapter.parent.parent.find(class_="flag", title="English")
+    ]
+    return eng_chapters
+
+
+def scrape_chapter(original_id):
+    data = session.get(API_URL, params={"id": original_id, "type": "chapter"}).json()
+
+    if data["status"] == "deleted":
+        return None
+
+    # data["server"] can be either of:
+    # - "/data/..." - meaning same origin as web server: https://mangadex.org/data/...
+    # - "https://sX.mangadex.org/data/..." where X is any digit.
+    page_base_url = data["server"] + data["hash"]
+    if page_base_url.startswith("/"):
+        page_base_url = f"https://mangadex.org{page_base_url}"
+    pages = [f"{page_base_url}/{page}" for page in data["page_array"]]
+
+    return {
+        "name": data["title"],
+        "pages": pages,
+    }
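
A worked example of the server-handling logic in scrape_chapter, with made-up API values (hash, filenames, and server host are purely illustrative):

    data = {"server": "/data/", "hash": "abc123", "page_array": ["1.png", "2.png"]}
    page_base_url = data["server"] + data["hash"]  # "/data/abc123"
    if page_base_url.startswith("/"):
        page_base_url = f"https://mangadex.org{page_base_url}"
    pages = [f"{page_base_url}/{page}" for page in data["page_array"]]
    # -> ["https://mangadex.org/data/abc123/1.png",
    #     "https://mangadex.org/data/abc123/2.png"]

With "server" set to "https://s5.mangadex.org/data/", the same pages would be served from that host instead.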