commit c29f04f172f1f8f510183c0bd119157f4719380d
Author: Bùi Thành Nhân <hi@imnhan.com>
Date: Sun May 24 22:45:20 2020 +0700
put the meat of all commands into a single module
diff --git a/src/pytaku_scraper/commands.py b/src/pytaku_scraper/commands.py
new file mode 100644
index 0000000..b992209
--- /dev/null
+++ b/src/pytaku_scraper/commands.py
@@ -0,0 +1,48 @@
+import requests
+from django.db import transaction
+from django.utils import timezone
+
+from .models import DownloadResult, TaskQueue
+from .sites.mangadex import get_latest_id
+
+
+def put_download_tasks():
+ latest_id = get_latest_id()
+ print("Found latest title id:", latest_id)
+
+ result = TaskQueue.put_bulk(
+ "download",
+ [
+ {"url": f"https://mangadex.org/api/?type=manga&id={i}"}
+ for i in range(1, latest_id + 1)
+ ],
+ )
+ print(f'Successfully put {len(result)} "download" tasks.')
+
+
+def download_worker():
+ while True:
+ with transaction.atomic():
+ task = TaskQueue.pop("download")
+ task_id = task.id
+ print(f"Processing task {task_id}: {task.payload}")
+ resp = requests.get(task.payload["url"], timeout=30)
+ assert resp.status_code in (200, 404), f"Unexpected error: {resp.text}"
+
+ DownloadResult.objects.update_or_create(
+ url=task.payload["url"],
+ method="get",
+ defaults={
+ "downloaded_at": timezone.now(),
+ "resp_body": resp.text,
+ "resp_status": resp.status_code,
+ },
+ )
+
+ task.finish()
+ print("Done task", task_id)
+
+
+def purge_queue(task_name):
+ count, _ = TaskQueue.objects.filter(name=task_name).delete()
+ print(f'Deleted {count} "{task_name}" tasks.')
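The whole loop body above runs inside transaction.atomic(), which is what makes it safe to start many workers: if the status-code assert fires, the transaction rolls back and the popped task stays in the queue. For that to hold, TaskQueue.pop has to hand each pending row to exactly one worker. Its body is not shown in this commit; a minimal sketch of what it presumably looks like on PostgreSQL, using Django's select_for_update with SKIP LOCKED:

    # Hypothetical sketch -- the real TaskQueue.pop is not part of this diff.
    # Lock one pending row, skipping rows already locked by other workers;
    # must be called inside transaction.atomic(), as download_worker() does.
    # (The real implementation may also need to wait when the queue is empty,
    # since download_worker() immediately dereferences task.id.)
    @classmethod
    def pop(cls, name):
        return (
            cls.objects.select_for_update(skip_locked=True)
            .filter(name=name)
            .order_by("id")
            .first()
        )

The row lock is held until the transaction commits, i.e. until task.finish() has deleted the row.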
diff --git a/src/pytaku_scraper/management/commands/download_worker.py b/src/pytaku_scraper/management/commands/download_worker.py
new file mode 100644
index 0000000..87f3f75
--- /dev/null
+++ b/src/pytaku_scraper/management/commands/download_worker.py
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+
+from pytaku_scraper.commands import download_worker
+
+
+class Command(BaseCommand):
+ help = "Download worker. Run as many as needed."
+
+ def handle(self, *args, **options):
+ download_worker()
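The command is a thin wrapper, so scaling out is just a process count: run ./manage.py download_worker once per desired worker, and the row locking described above keeps them from stepping on each other.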
diff --git a/src/pytaku_scraper/management/commands/purge_queue.py b/src/pytaku_scraper/management/commands/purge_queue.py
index fe66b48..e21f547 100644
--- a/src/pytaku_scraper/management/commands/purge_queue.py
+++ b/src/pytaku_scraper/management/commands/purge_queue.py
@@ -1,17 +1,14 @@
from django.core.management.base import BaseCommand
-from pytaku_scraper.models import TaskQueue
+from pytaku_scraper.commands import purge_queue
class Command(BaseCommand):
help = "Delete all tasks in a queue."
def add_arguments(self, parser):
- parser.add_argument("task", choices=["scrape"])
+ parser.add_argument("task")
def handle(self, *args, **options):
task = options["task"]
- assert task == "scrape"
-
- count, _ = TaskQueue.objects.filter(name=task).delete()
- print(f'Deleted {count} "{task}" tasks.')
+ purge_queue(task)
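With the hard-coded choices=["scrape"] and the matching assert gone, the command now purges whatever queue name it is given, e.g. ./manage.py purge_queue download.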
diff --git a/src/pytaku_scraper/management/commands/put_download_tasks.py b/src/pytaku_scraper/management/commands/put_download_tasks.py
new file mode 100644
index 0000000..689477f
--- /dev/null
+++ b/src/pytaku_scraper/management/commands/put_download_tasks.py
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+
+from pytaku_scraper.commands import put_download_tasks
+
+
+class Command(BaseCommand):
+ help = "Puts download tasks for mangadex titles."
+
+ def handle(self, *args, **options):
+ put_download_tasks()
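Combined with the unique (name, payload) constraint and ignore_conflicts=True introduced below, ./manage.py put_download_tasks is safe to re-run: payloads still sitting in the queue are silently skipped instead of raising an IntegrityError. (Finished tasks are deleted, so a re-run will re-enqueue ids that were already processed.)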
diff --git a/src/pytaku_scraper/management/commands/put_tasks.py b/src/pytaku_scraper/management/commands/put_tasks.py
deleted file mode 100644
index 5e85e94..0000000
--- a/src/pytaku_scraper/management/commands/put_tasks.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from django.core.management.base import BaseCommand
-
-from pytaku_scraper.models import TaskQueue
-
-
-class Command(BaseCommand):
- help = "Puts various tasks."
-
- def add_arguments(self, parser):
- parser.add_argument("task", choices=["scrape"])
- parser.add_argument("start_id", type=int)
- parser.add_argument("end_id", type=int)
-
- def handle(self, *args, **options):
- assert options["task"] == "scrape"
-
- result = TaskQueue.put_bulk(
- "scrape",
- [
- {"url": f"https://mangadex.org/api/?type=manga&id={i}"}
- for i in range(options["start_id"], options["end_id"] + 1)
- ],
- )
-
- print("Result:", result)
diff --git a/src/pytaku_scraper/management/commands/scrape.py b/src/pytaku_scraper/management/commands/scrape.py
deleted file mode 100644
index c1fe0cf..0000000
--- a/src/pytaku_scraper/management/commands/scrape.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import requests
-from django.core.management.base import BaseCommand
-from django.db import transaction
-
-from pytaku_scraper.models import ScrapeAttempt, TaskQueue
-
-
-class Command(BaseCommand):
- help = "Scrape worker. Run as many as needed."
-
- def handle(self, *args, **options):
- task_name = "scrape"
-
- while True:
- with transaction.atomic():
- task = TaskQueue.pop(task_name)
- task_id = task.id
- print(f"Processing task {task_id}: {task.payload}")
- resp = requests.get(task.payload["url"], timeout=30)
- assert resp.status_code in (200, 404), f"Unexpected error: {resp.text}"
-
- ScrapeAttempt.objects.create(
- url=task.payload["url"],
- method="get", # TODO
- resp_body=resp.text,
- resp_status=resp.status_code,
- )
-
- task.finish()
- print("Done task", task_id)
diff --git a/src/pytaku_scraper/migrations/0002_auto_20200524_1413.py b/src/pytaku_scraper/migrations/0002_auto_20200524_1413.py
new file mode 100644
index 0000000..181b284
--- /dev/null
+++ b/src/pytaku_scraper/migrations/0002_auto_20200524_1413.py
@@ -0,0 +1,34 @@
+# Generated by Django 3.0.5 on 2020-05-24 14:13
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('pytaku_scraper', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='DownloadResult',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(auto_now_add=True)),
+ ('url', models.CharField(max_length=1024)),
+ ('method', models.CharField(default='get', max_length=7)),
+ ('resp_body', models.TextField()),
+ ('resp_status', models.IntegerField()),
+ ],
+ options={
+ 'db_table': 'download_result',
+ },
+ ),
+ migrations.DeleteModel(
+ name='ScrapeAttempt',
+ ),
+ migrations.AddConstraint(
+ model_name='downloadresult',
+ constraint=models.UniqueConstraint(fields=('url', 'method'), name='unique_url_method'),
+ ),
+ ]
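The unique (url, method) pair is what download_worker's update_or_create keys on: fetching the same URL again overwrites resp_body and downloaded_at in place rather than adding one row per attempt, which is also why the append-only ScrapeAttempt model is dropped in the same migration.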
diff --git a/src/pytaku_scraper/migrations/0003_auto_20200524_1508.py b/src/pytaku_scraper/migrations/0003_auto_20200524_1508.py
new file mode 100644
index 0000000..66b7737
--- /dev/null
+++ b/src/pytaku_scraper/migrations/0003_auto_20200524_1508.py
@@ -0,0 +1,22 @@
+# Generated by Django 3.0.5 on 2020-05-24 15:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('pytaku_scraper', '0002_auto_20200524_1413'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='taskqueue',
+ name='name',
+ field=models.CharField(choices=[('Download', 'download')], max_length=100),
+ ),
+ migrations.AddConstraint(
+ model_name='taskqueue',
+ constraint=models.UniqueConstraint(fields=('name', 'payload'), name='unique_url_payload'),
+ ),
+ ]
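Minor nit: the constraint actually spans (name, payload), so unique_url_payload is a slightly misleading name. Django only uses the string as an identifier, so nothing breaks.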
diff --git a/src/pytaku_scraper/migrations/0004_auto_20200524_1538.py b/src/pytaku_scraper/migrations/0004_auto_20200524_1538.py
new file mode 100644
index 0000000..1a70a30
--- /dev/null
+++ b/src/pytaku_scraper/migrations/0004_auto_20200524_1538.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0.5 on 2020-05-24 15:38
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('pytaku_scraper', '0003_auto_20200524_1508'),
+ ]
+
+ operations = [
+ migrations.RenameField(
+ model_name='downloadresult',
+ old_name='created_at',
+ new_name='downloaded_at',
+ ),
+ ]
diff --git a/src/pytaku_scraper/models.py b/src/pytaku_scraper/models.py
index 0ff5391..4ac7710 100644
--- a/src/pytaku_scraper/models.py
+++ b/src/pytaku_scraper/models.py
@@ -1,7 +1,7 @@
from django.contrib.postgres.fields import JSONField
from django.db import models
-QUEUE_NAMES = [("Scrape", "scrape")]
+QUEUE_NAMES = [("Download", "download")]
class TaskQueue(models.Model):
@@ -22,6 +22,11 @@ class TaskQueue(models.Model):
class Meta:
db_table = "task_queue"
+ constraints = [
+ models.UniqueConstraint(
+ fields=["name", "payload"], name="unique_url_payload"
+ )
+ ]
created_at = models.DateTimeField(auto_now_add=True)
name = models.CharField(max_length=100, choices=QUEUE_NAMES)
@@ -34,7 +39,8 @@ def put(cls, name, payload):
@classmethod
def put_bulk(cls, name, payloads):
return cls.objects.bulk_create(
- [cls(name=name, payload=payload) for payload in payloads]
+ [cls(name=name, payload=payload) for payload in payloads],
+ ignore_conflicts=True,
)
@classmethod
@@ -56,16 +62,17 @@ def finish(self):
return self.delete()
-class ScrapeAttempt(models.Model):
+class DownloadResult(models.Model):
class Meta:
- db_table = "scrape_attempt"
+ db_table = "download_result"
+ constraints = [
+ models.UniqueConstraint(fields=["url", "method"], name="unique_url_method")
+ ]
- scraped_at = models.DateTimeField(auto_now_add=True)
+ downloaded_at = models.DateTimeField(auto_now_add=True)
url = models.CharField(max_length=1024)
- method = models.CharField(max_length=7)
- headers = JSONField(default=dict)
- body = models.TextField()
+ method = models.CharField(max_length=7, default="get")
resp_body = models.TextField()
resp_status = models.IntegerField()
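One nit survives the rename: Django choices are (stored value, human-readable label) pairs, so this QUEUE_NAMES declares "Download" as the stored value while put_bulk() and pop() read and write "download". It goes unnoticed because bulk_create() and QuerySet filters never validate against choices, but full_clean() or a ModelForm would reject every existing row. The presumably intended spelling:

    # (value, label), not (label, value):
    QUEUE_NAMES = [("download", "Download")]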
diff --git a/src/pytaku_scraper/sites/mangadex.py b/src/pytaku_scraper/sites/mangadex.py
index 189c440..8ce7558 100644
--- a/src/pytaku_scraper/sites/mangadex.py
+++ b/src/pytaku_scraper/sites/mangadex.py
@@ -60,7 +60,6 @@ def scrape_title(original_id):
source_url = title_url_from_id(original_id)
html = session.get(source_url).text
soup = BeautifulSoup(html, "lxml")
- print(soup)
url = soup.select('link[rel="canonical"]')[0].attrs["href"]
name = soup.select(".card-header span.mx-1")[0].text
@@ -180,3 +179,12 @@ def scrape_chapter(original_id):
"name": data["title"],
"pages": pages,
}
+
+
+def get_latest_id():
+ resp = session.get("https://mangadex.org/")
+ assert resp.status_code == 200, resp.text
+ soup = BeautifulSoup(resp.text, "lxml")
+ latest_href = soup.select_one("#new_titles_owl_carousel a").attrs["href"]
+ latest_id = re.search(r"/(\d+)/", latest_href).group(1)
+ return int(latest_id)
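get_latest_id() leans on two things this hunk doesn't show: re and session being available at the top of mangadex.py (session appears in the surrounding context, re does not), and the first anchor in #new_titles_owl_carousel having an href whose first numeric path segment is the title id. If the front page markup shifts, the current code dies with an opaque AttributeError when select_one(...) returns None; a hypothetical hardened variant, identical on the happy path:

    # Hypothetical hardening sketch -- not in this commit.
    def get_latest_id():
        resp = session.get("https://mangadex.org/")
        assert resp.status_code == 200, resp.text
        soup = BeautifulSoup(resp.text, "lxml")
        link = soup.select_one("#new_titles_owl_carousel a")
        assert link is not None, "front page carousel not found; layout changed?"
        match = re.search(r"/(\d+)/", link.attrs["href"])
        assert match is not None, f"no numeric id in href: {link.attrs['href']}"
        return int(match.group(1))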