
Commit

Update v1.1
- Fixed an error that occurred if you didn't put https:// at the beginning of the link
- Added a debug mode for more information about downloading
- Added a logger
- All copied sites are now saved in the copied/ folder
Weever1337 committed Sep 28, 2024
1 parent c3ed23a commit ec675ed
Showing 6 changed files with 142 additions and 75 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
+copied/

# PyInstaller
# Usually these files are written by a python script from a template
14 changes: 11 additions & 3 deletions README.md
@@ -19,17 +19,25 @@ pip install -r requirements.txt
After installing the necessary libraries, you can run the script and input any website URL to copy its contents.

```bash
-python webcopy.py
+python main.py
```
The script will prompt you to enter the URL of the website you want to copy. The content will be saved in a folder named after the domain of the website.
Or you can use command-line arguments:
```bash
python main.py -u https://example.com
```
You can also enable debug mode:
```bash
python main.py -u https://example.com -d
```

## Example
```bash
Enter the URL of the website: https://example.com
```
-This will create a folder named example.com in your current directory, containing the website's HTML and assets in their respective subfolders.
+This will create a folder named example.com inside the `copied` directory, containing the website's HTML and assets in their respective subfolders.
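For example, a page that links one stylesheet, one script, and one image might produce a layout like this (hypothetical file names; resource paths mirror the URL paths):

```
copied/
└── example.com/
    ├── index.html
    ├── css/style.css
    ├── js/app.js
    └── images/logo.png
```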

## Requirements
-- Python 3.7+
+- Python 3.8+
- aiohttp
- beautifulsoup4
27 changes: 27 additions & 0 deletions main.py
@@ -0,0 +1,27 @@
import argparse
import asyncio
from urllib.parse import urlparse

from src.webcopy import download_page
from src import logger


async def main():
    parser = argparse.ArgumentParser(description="Webcopy")
    parser.add_argument(
        "-u", "-l", "--url", "--link", type=str, help="The URL of the website to copy"
    )
    parser.add_argument(
        "-d", "-debug", "--debug", action="store_true", help="Enable debug mode"
    )
    args = parser.parse_args()
    url = args.url or input("Enter the URL of the website: ")
    debug = args.debug
    # Prepend a scheme if the user omitted it (the v1.1 fix).
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Every copy lands under copied/<domain>.
    folder_name = "copied/" + urlparse(url).netloc
    logger.info(f"Downloading website from {url} to {folder_name}")
    await download_page(url, folder_name, debug)


if __name__ == "__main__":
    asyncio.run(main())
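Since `-l`/`--link` are aliases for `-u`/`--url` and a missing scheme is filled in automatically, the following invocations are equivalent:

```bash
python main.py --url https://example.com --debug
python main.py -l example.com -d
```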
34 changes: 34 additions & 0 deletions src/__init__.py
@@ -0,0 +1,34 @@
from loguru import logger
import sys


def setup_logger():
    # One sink per level so each level gets its own colour scheme.
    info = "<blue>{time:HH:mm:ss}</blue> => <green>{message}</green>"
    error = "<red>{time:HH:mm:ss}</red> => <red>{message}</red>"
    debug = "<yellow>{time:HH:mm:ss}</yellow> => <yellow>{message}</yellow>"

    logger.remove()  # drop loguru's default handler
    logger.add(
        sys.stdout,
        colorize=True,
        format=info,
        level="INFO",
        filter=lambda record: record["level"].name == "INFO",
    )
    logger.add(
        sys.stdout,
        colorize=True,
        format=error,
        level="ERROR",
        filter=lambda record: record["level"].name == "ERROR",
    )
    logger.add(
        sys.stdout,
        colorize=True,
        format=debug,
        level="DEBUG",
        filter=lambda record: record["level"].name == "DEBUG",
    )


setup_logger()
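A minimal usage sketch: importing the package runs `setup_logger()` once, so any module can pull in the shared logger (webcopy.py below does exactly this). Note that the DEBUG sink is always attached; the application decides whether to emit debug messages by checking its own `debug` flag before calling `logger.debug`:

```python
from src import logger  # importing the package configures the sinks

logger.info("INFO messages print in the blue/green format")
logger.error("ERROR messages print in red")
logger.debug("DEBUG messages print in yellow")
```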
69 changes: 69 additions & 0 deletions src/webcopy.py
@@ -0,0 +1,69 @@
import os
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

from . import logger


async def create_directory_for_resource(resource_url, folder):
    # Mirror the URL path under the target folder, e.g.
    # https://example.com/css/style.css -> <folder>/css/style.css
    parsed_url = urlparse(resource_url)
    resource_path = parsed_url.path.lstrip("/")
    resource_folder = os.path.join(folder, os.path.dirname(resource_path))

    if not os.path.exists(resource_folder):
        os.makedirs(resource_folder)

    return os.path.join(resource_folder, os.path.basename(resource_path))


async def download_page(url, folder, debug):
    os.makedirs(folder, exist_ok=True)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    logger.error(f"Error fetching {url}")
                    return

                content = await response.text()
                with open(
                    os.path.join(folder, "index.html"), "w", encoding="utf-8"
                ) as file:
                    file.write(content)

                await download_resources(
                    BeautifulSoup(content, "html.parser"), url, folder, session, debug
                )
    except Exception as e:
        logger.error(f"Error downloading page {url}: {e}")
        if debug:
            from traceback import format_exc

            logger.error(format_exc())
        return

    logger.info(f"Page {url} downloaded successfully.")


async def download_resources(soup, base_url, folder, session, debug):
    # Collect every <img src>, <link href>, and <script src> reference
    # and download them concurrently.
    tasks = [
        save_resource(urljoin(base_url, resource.get(attr)), folder, session, debug)
        for tag, attr in {"img": "src", "link": "href", "script": "src"}.items()
        for resource in soup.find_all(tag)
        if resource.get(attr)
    ]
    if debug:
        logger.debug(f"Downloading {len(tasks)} resources")
    await asyncio.gather(*tasks)


async def save_resource(url, folder, session, debug):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                resource_path = await create_directory_for_resource(url, folder)
                with open(resource_path, "wb") as file:
                    file.write(await response.read())
                if debug:
                    logger.debug(f"Resource {url} saved to {resource_path}")
    except Exception as e:
        logger.error(f"Error downloading resource {url}: {e}")
72 changes: 0 additions & 72 deletions webcopy.py

This file was deleted.
