commit c29f04f172f1f8f510183c0bd119157f4719380d
Author: Bùi Thành Nhân <hi@imnhan.com>
Date:   Sun May 24 22:45:20 2020 +0700

    put the meat of all commands into a single module

diff --git a/src/pytaku_scraper/commands.py b/src/pytaku_scraper/commands.py
new file mode 100644
index 0000000..b992209
--- /dev/null
+++ b/src/pytaku_scraper/commands.py
@@ -0,0 +1,48 @@
+import requests
+from django.db import transaction
+from django.utils import timezone
+
+from .models import DownloadResult, TaskQueue
+from .sites.mangadex import get_latest_id
+
+
+def put_download_tasks():
+    latest_id = get_latest_id()
+    print("Found latest title id:", latest_id)
+
+    result = TaskQueue.put_bulk(
+        "download",
+        [
+            {"url": f"https://mangadex.org/api/?type=manga&id={i}"}
+            for i in range(1, latest_id + 1)
+        ],
+    )
+    print(f'Successfully put {len(result)} "download" tasks.')
+
+
+def download_worker():
+    while True:
+        with transaction.atomic():
+            task = TaskQueue.pop("download")
+            task_id = task.id
+            print(f"Processing task {task_id}: {task.payload}")
+            resp = requests.get(task.payload["url"], timeout=30)
+            assert resp.status_code in (200, 404), f"Unexpected error: {resp.text}"
+
+            DownloadResult.objects.update_or_create(
+                url=task.payload["url"],
+                method="get",
+                defaults={
+                    "downloaded_at": timezone.now(),
+                    "resp_body": resp.text,
+                    "resp_status": resp.status_code,
+                },
+            )
+
+            task.finish()
+            print("Done task", task_id)
+
+
+def purge_queue(task_name):
+    count, _ = TaskQueue.objects.filter(name=task_name).delete()
+    print(f'Deleted {count} "{task_name}" tasks.')
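
The worker's concurrency story lives in TaskQueue.pop, which this diff does not
show. For "run as many as needed" to hold, pop has to hand each transaction a
row that no other worker holds. A minimal sketch of what that typically looks
like on Postgres, assuming select_for_update(skip_locked=True); this is
hypothetical, since the real method body is not part of this commit:

    # Hypothetical sketch of TaskQueue.pop as it might sit on the model;
    # the real implementation is not shown in this diff.
    @classmethod
    def pop(cls, name):
        # FOR UPDATE SKIP LOCKED: concurrent workers each lock a different row,
        # and the lock is released when the caller's transaction.atomic() block
        # commits or rolls back. Returns None on an empty queue (the worker
        # loop above assumes tasks exist).
        return (
            cls.objects.select_for_update(skip_locked=True)
            .filter(name=name)
            .order_by("created_at")
            .first()
        )

Because pop runs inside transaction.atomic(), an exception in download_worker
(for instance the status-code assert) rolls the transaction back and the task
returns to the queue for another worker. One caveat in put_download_tasks: with
ignore_conflicts=True (see the models.py hunk below), bulk_create returns every
object it attempted, so len(result) counts attempted rows, not rows actually
inserted.
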
diff --git a/src/pytaku_scraper/management/commands/download_worker.py b/src/pytaku_scraper/management/commands/download_worker.py
new file mode 100644
index 0000000..87f3f75
--- /dev/null
+++ b/src/pytaku_scraper/management/commands/download_worker.py
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+
+from pytaku_scraper.commands import download_worker
+
+
+class Command(BaseCommand):
+    help = "Download worker. Run as many as needed."
+
+    def handle(self, *args, **options):
+        download_worker()
diff --git a/src/pytaku_scraper/management/commands/purge_queue.py b/src/pytaku_scraper/management/commands/purge_queue.py
index fe66b48..e21f547 100644
--- a/src/pytaku_scraper/management/commands/purge_queue.py
+++ b/src/pytaku_scraper/management/commands/purge_queue.py
@@ -1,17 +1,14 @@
 from django.core.management.base import BaseCommand
 
-from pytaku_scraper.models import TaskQueue
+from pytaku_scraper.commands import purge_queue
 
 
 class Command(BaseCommand):
     help = "Delete all tasks in a queue."
 
     def add_arguments(self, parser):
-        parser.add_argument("task", choices=["scrape"])
+        parser.add_argument("task")
 
     def handle(self, *args, **options):
         task = options["task"]
-        assert task == "scrape"
-
-        count, _ = TaskQueue.objects.filter(name=task).delete()
-        print(f'Deleted {count} "{task}" tasks.')
+        purge_queue(task)
diff --git a/src/pytaku_scraper/management/commands/put_download_tasks.py b/src/pytaku_scraper/management/commands/put_download_tasks.py
new file mode 100644
index 0000000..689477f
--- /dev/null
+++ b/src/pytaku_scraper/management/commands/put_download_tasks.py
@@ -0,0 +1,10 @@
+from django.core.management.base import BaseCommand
+
+from pytaku_scraper.commands import put_download_tasks
+
+
+class Command(BaseCommand):
+    help = "Puts download tasks for mangadex titles."
+
+    def handle(self, *args, **options):
+        put_download_tasks()
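
Taken together, the three thin wrappers make the pipeline drivable from
manage.py. A typical session (standard Django invocations; scale out by
starting more worker processes):

    python manage.py put_download_tasks      # enqueue one task per title id
    python manage.py download_worker         # start a worker; repeat in other shells as needed
    python manage.py purge_queue download    # drop all remaining "download" tasks

Note that purge_queue lost its choices=["scrape"] restriction and the matching
assert, so it now accepts any queue name and simply reports zero deletions for
names that do not exist.
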
diff --git a/src/pytaku_scraper/management/commands/put_tasks.py b/src/pytaku_scraper/management/commands/put_tasks.py
deleted file mode 100644
index 5e85e94..0000000
--- a/src/pytaku_scraper/management/commands/put_tasks.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from django.core.management.base import BaseCommand
-
-from pytaku_scraper.models import TaskQueue
-
-
-class Command(BaseCommand):
-    help = "Puts various tasks."
-
-    def add_arguments(self, parser):
-        parser.add_argument("task", choices=["scrape"])
-        parser.add_argument("start_id", type=int)
-        parser.add_argument("end_id", type=int)
-
-    def handle(self, *args, **options):
-        assert options["task"] == "scrape"
-
-        result = TaskQueue.put_bulk(
-            "scrape",
-            [
-                {"url": f"https://mangadex.org/api/?type=manga&id={i}"}
-                for i in range(options["start_id"], options["end_id"] + 1)
-            ],
-        )
-
-        print("Result:", result)
diff --git a/src/pytaku_scraper/management/commands/scrape.py b/src/pytaku_scraper/management/commands/scrape.py
deleted file mode 100644
index c1fe0cf..0000000
--- a/src/pytaku_scraper/management/commands/scrape.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import requests
-from django.core.management.base import BaseCommand
-from django.db import transaction
-
-from pytaku_scraper.models import ScrapeAttempt, TaskQueue
-
-
-class Command(BaseCommand):
-    help = "Scrape worker. Run as many as needed."
-
-    def handle(self, *args, **options):
-        task_name = "scrape"
-
-        while True:
-            with transaction.atomic():
-                task = TaskQueue.pop(task_name)
-                task_id = task.id
-                print(f"Processing task {task_id}: {task.payload}")
-                resp = requests.get(task.payload["url"], timeout=30)
-                assert resp.status_code in (200, 404), f"Unexpected error: {resp.text}"
-
-                ScrapeAttempt.objects.create(
-                    url=task.payload["url"],
-                    method="get",  # TODO
-                    resp_body=resp.text,
-                    resp_status=resp.status_code,
-                )
-
-                task.finish()
-                print("Done task", task_id)
diff --git a/src/pytaku_scraper/migrations/0002_auto_20200524_1413.py b/src/pytaku_scraper/migrations/0002_auto_20200524_1413.py
new file mode 100644
index 0000000..181b284
--- /dev/null
+++ b/src/pytaku_scraper/migrations/0002_auto_20200524_1413.py
@@ -0,0 +1,34 @@
+# Generated by Django 3.0.5 on 2020-05-24 14:13
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('pytaku_scraper', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='DownloadResult',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('created_at', models.DateTimeField(auto_now_add=True)),
+                ('url', models.CharField(max_length=1024)),
+                ('method', models.CharField(default='get', max_length=7)),
+                ('resp_body', models.TextField()),
+                ('resp_status', models.IntegerField()),
+            ],
+            options={
+                'db_table': 'download_result',
+            },
+        ),
+        migrations.DeleteModel(
+            name='ScrapeAttempt',
+        ),
+        migrations.AddConstraint(
+            model_name='downloadresult',
+            constraint=models.UniqueConstraint(fields=('url', 'method'), name='unique_url_method'),
+        ),
+    ]
diff --git a/src/pytaku_scraper/migrations/0003_auto_20200524_1508.py b/src/pytaku_scraper/migrations/0003_auto_20200524_1508.py
new file mode 100644
index 0000000..66b7737
--- /dev/null
+++ b/src/pytaku_scraper/migrations/0003_auto_20200524_1508.py
@@ -0,0 +1,22 @@
+# Generated by Django 3.0.5 on 2020-05-24 15:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('pytaku_scraper', '0002_auto_20200524_1413'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='taskqueue',
+            name='name',
+            field=models.CharField(choices=[('download', 'Download')], max_length=100),
+        ),
+        migrations.AddConstraint(
+            model_name='taskqueue',
+            constraint=models.UniqueConstraint(fields=('name', 'payload'), name='unique_name_payload'),
+        ),
+    ]
diff --git a/src/pytaku_scraper/migrations/0004_auto_20200524_1538.py b/src/pytaku_scraper/migrations/0004_auto_20200524_1538.py
new file mode 100644
index 0000000..1a70a30
--- /dev/null
+++ b/src/pytaku_scraper/migrations/0004_auto_20200524_1538.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0.5 on 2020-05-24 15:38
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('pytaku_scraper', '0003_auto_20200524_1508'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='downloadresult',
+            old_name='created_at',
+            new_name='downloaded_at',
+        ),
+    ]
diff --git a/src/pytaku_scraper/models.py b/src/pytaku_scraper/models.py
index 0ff5391..4ac7710 100644
--- a/src/pytaku_scraper/models.py
+++ b/src/pytaku_scraper/models.py
@@ -1,7 +1,7 @@
 from django.contrib.postgres.fields import JSONField
 from django.db import models
 
-QUEUE_NAMES = [("Scrape", "scrape")]
+QUEUE_NAMES = [("Download", "download")]
 
 
 class TaskQueue(models.Model):
@@ -22,6 +22,11 @@ class TaskQueue(models.Model):
 
     class Meta:
         db_table = "task_queue"
+        constraints = [
+            models.UniqueConstraint(
+                fields=["name", "payload"], name="unique_url_payload"
+            )
+        ]
 
     created_at = models.DateTimeField(auto_now_add=True)
     name = models.CharField(max_length=100, choices=QUEUE_NAMES)
@@ -34,7 +39,8 @@ def put(cls, name, payload):
     @classmethod
     def put_bulk(cls, name, payloads):
         return cls.objects.bulk_create(
-            [cls(name=name, payload=payload) for payload in payloads]
+            [cls(name=name, payload=payload) for payload in payloads],
+            ignore_conflicts=True,
         )
 
     @classmethod
@@ -56,16 +62,17 @@ def finish(self):
         return self.delete()
 
 
-class ScrapeAttempt(models.Model):
+class DownloadResult(models.Model):
     class Meta:
-        db_table = "scrape_attempt"
+        db_table = "download_result"
+        constraints = [
+            models.UniqueConstraint(fields=["url", "method"], name="unique_url_method")
+        ]
 
-    scraped_at = models.DateTimeField(auto_now_add=True)
+    downloaded_at = models.DateTimeField(auto_now_add=True)
 
     url = models.CharField(max_length=1024)
-    method = models.CharField(max_length=7)
-    headers = JSONField(default=dict)
-    body = models.TextField()
+    method = models.CharField(max_length=7, default="get")
 
     resp_body = models.TextField()
     resp_status = models.IntegerField()
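
Two layers of idempotency land in this models.py change. On the queue side, the
(name, payload) unique constraint plus bulk_create(..., ignore_conflicts=True),
which Postgres executes as INSERT ... ON CONFLICT DO NOTHING, makes
put_download_tasks safe to re-run: already-queued payloads are skipped
silently. On the result side, DownloadResult replaces the append-only
ScrapeAttempt, and its (url, method) constraint is exactly what
update_or_create in download_worker keys on, so re-downloading a URL overwrites
the stored response instead of piling up rows. A hedged sketch of the queue
semantics (the example.com payload is made up):

    # Re-queueing an identical payload is a no-op thanks to the
    # (name, payload) unique constraint.
    TaskQueue.put_bulk("download", [{"url": "https://example.com/a"}])
    TaskQueue.put_bulk("download", [{"url": "https://example.com/a"}])  # conflict ignored
    assert TaskQueue.objects.filter(name="download").count() == 1
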
diff --git a/src/pytaku_scraper/sites/mangadex.py b/src/pytaku_scraper/sites/mangadex.py
index 189c440..8ce7558 100644
--- a/src/pytaku_scraper/sites/mangadex.py
+++ b/src/pytaku_scraper/sites/mangadex.py
@@ -60,7 +60,6 @@ def scrape_title(original_id):
     source_url = title_url_from_id(original_id)
     html = session.get(source_url).text
     soup = BeautifulSoup(html, "lxml")
-    print(soup)
 
     url = soup.select('link[rel="canonical"]')[0].attrs["href"]
     name = soup.select(".card-header span.mx-1")[0].text
@@ -180,3 +179,12 @@ def scrape_chapter(original_id):
         "name": data["title"],
         "pages": pages,
     }
+
+
+def get_latest_id():
+    resp = session.get("https://mangadex.org/")
+    assert resp.status_code == 200, resp.text
+    soup = BeautifulSoup(resp.text, "lxml")
+    latest_href = soup.select_one("#new_titles_owl_carousel a").attrs["href"]
+    latest_id = re.search(r"/(\d+)/", latest_href).group(1)
+    return int(latest_id)
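
get_latest_id leans on two assumptions the hunk does not verify: that re is
already imported at the top of mangadex.py (the imports are outside this hunk),
and that the first anchor inside #new_titles_owl_carousel carries a numeric
title id in its href. The regex grabs the first all-digit path segment, roughly
like this (the href shape here is hypothetical):

    import re

    # Hypothetical MangaDex href; the real markup may differ or change.
    href = "/title/48045/some-title-slug"
    assert re.search(r"/(\d+)/", href).group(1) == "48045"

If the homepage markup ever changes, select_one returns None and the attrs
lookup raises AttributeError, which is consistent with the commit's fail-fast
assert style but worth knowing when put_download_tasks suddenly dies on
startup.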